# AI/BI Adoption Dashboard - Metadata Collection

This notebook collects metadata from various Databricks AI/BI services using the Databricks SDK.

## Tables Created

This notebook creates the following Delta tables:

### Genie
* `adb_genie_spaces` - All Genie spaces
* `adb_genie_conversations` - All conversations within Genie spaces
* `adb_genie_messages` - All messages with query results and errors

### Dashboards
* `adb_dashboards` - All Lakeview dashboards
* `adb_dashboard_schedules` - Dashboard schedules
* `adb_dashboard_subscriptions` - Dashboard subscriptions

### Models & Serving
* `adb_models` - Unity Catalog registered models
* `adb_serving_endpoints` - Model serving endpoints

### Apps
* `adb_apps` - Databricks Apps

## Parameters
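* `catalog_name` - Target catalog for tables (default: `users`)
* `schema_name` - Target schema for tables (required; there is no default)
* `skip_get_conversations` - When `true` (the default), the per-conversation message collection that fills `adb_genie_messages` is skipped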

In [0]:
%pip install databricks_sdk --upgrade

In [0]:
# Restart the Python process. This runs right after the `%pip install ... --upgrade`
# cell, presumably so that the upgraded SDK is the version imported below.
dbutils.library.restartPython()

In [0]:
# Notebook parameters, exposed as Databricks text widgets.
dbutils.widgets.text("catalog_name", "users")
dbutils.widgets.text("schema_name", "")
dbutils.widgets.text("skip_get_conversations", "true")

# Widget values are free text, so trim surrounding whitespace before using them:
# a padded "true" must still count as true, and a blank name must not slip through.
skip_get_conversations = dbutils.widgets.get("skip_get_conversations").strip().lower() == 'true'
catalog_name = dbutils.widgets.get("catalog_name").strip()
schema_name = dbutils.widgets.get("schema_name").strip()

# schema_name defaults to "", so the notebook stops here until a schema is supplied.
# Raise explicitly instead of using `assert`, which is skipped when Python runs with -O.
if not (catalog_name and schema_name):
    raise ValueError("catalog_name and schema_name must be provided")


In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
import pandas as pd
import json
import pyspark.sql.functions as F 

# Single SDK client, created with no explicit arguments. The later cells call `w`
# directly; their own `WorkspaceClient()` lines are commented out.
w = WorkspaceClient()

## Genie Spaces, Conversations, and Messages

This section collects metadata about Genie spaces and their usage:
* **adb_genie_spaces**: All Genie spaces in the workspace
* **adb_genie_conversations**: All conversations within each Genie space
* **adb_genie_messages**: All messages within each conversation, including query results and errors

In [0]:
# Snapshot all Genie spaces returned by the SDK into adb_genie_spaces.
spaces = []
page_token = None

# Uses the shared client `w` created earlier in the notebook.

# Keep requesting pages until the response carries no next_page_token.
while True:
    response = w.genie.list_spaces(page_token=page_token)
    # Tolerate a page that carries no `spaces` list.
    for s in response.spaces or []:
        spaces.append({
            "space_id": getattr(s, "space_id", None),
            "name": getattr(s, "title", None),  # the space's `title` is stored as `name`
            "description": getattr(s, "description", None),
            "warehouse_id": getattr(s, "warehouse_id", None)
        })
    if not response.next_page_token:
        break
    page_token = response.next_page_token

# An explicit schema keeps the column types stable even when a column (for example
# `description`) is null in every row, and lets an empty result be written too.
genie_spaces_df = spark.createDataFrame(
    spaces,
    "space_id string, name string, description string, warehouse_id string",
)

# Always overwrite the table, even with zero rows: the conversations cell reads
# adb_genie_spaces unconditionally, so it must exist and must not hold rows from
# an earlier run.
genie_spaces_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(
    f"{catalog_name}.{schema_name}.adb_genie_spaces"
)

if spaces:
    print(f"Loaded {len(spaces)} Genie spaces into table adb_genie_spaces")
else:
    print("No Genie spaces found.")

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
import pandas as pd

# Uses the shared client `w` created earlier in the notebook.

# Hard-coded switch that is passed to the API as `include_all`.
has_manage_on_all_spaces = True
include_all_conversations = has_manage_on_all_spaces

# Space ids come from the table written by the Genie spaces cell.
space_ids = [
    row.space_id
    for row in spark.table(f"{catalog_name}.{schema_name}.adb_genie_spaces").select('space_id').collect()
]
conversations = []

print(f"Fetching conversations for {len(space_ids)} Genie spaces...")

for space_id in space_ids:
    page_token = None
    try:
        # Walk the pages of one space; a failure abandons only this space.
        while True:
            response = w.genie.list_conversations(
                space_id=space_id,
                include_all=include_all_conversations,
                page_token=page_token,
            )
            for conv in response.conversations:
                # Tag each conversation record with the space it was listed under.
                conversations.append({**conv.as_dict(), 'space_id': space_id})

            if not response.next_page_token:
                break
            page_token = response.next_page_token
    except Exception as e:
        print(f"Error fetching conversations for space {space_id}: {e}")

if not conversations:
    print("No Genie conversations found.")
else:
    # created_timestamp is divided by 1000 before the conversion to a timestamp.
    created_ts = F.from_unixtime(F.col('created_timestamp')/1000).cast('timestamp')
    genie_conversations_df = spark.createDataFrame(conversations).withColumn('created_timestamp', created_ts)
    genie_conversations_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(
        f"{catalog_name}.{schema_name}.adb_genie_conversations"
    )
    print(f"Loaded {genie_conversations_df.count()} Genie conversations into table adb_genie_conversations")

In [0]:
# Collect the messages of every known Genie conversation into adb_genie_messages.
# Uses the shared client `w` created earlier in the notebook.

messages = []

if skip_get_conversations:
    # Decide before touching the conversations table: when collection is disabled
    # the table is not needed, and it may not exist (the conversations cell only
    # writes it when at least one conversation was found).
    print('Skipping conversations pull as skip_get_conversations is set to true')
    conversation_pairs = []
else:
    # Get all space_id and conversation_id pairs from the previously created table
    conversation_pairs = [
        (row.space_id, row.conversation_id)
        for row in spark.table(f"{catalog_name}.{schema_name}.adb_genie_conversations").select("space_id", "conversation_id").collect()
    ]
    print(f"Fetching messages for {len(conversation_pairs)} conversations...")

for space_id, conversation_id in conversation_pairs:
    try:
        page_token = None
        # Walk the pages of one conversation; a failure abandons only this conversation.
        while True:
            response = w.genie.list_conversation_messages(
                space_id=space_id,
                conversation_id=conversation_id,
                page_token=page_token
            )
            if not response.messages:
                break

            for msg in response.messages:
                # Extract query info from the first attachment that carries a query
                statement_id = None
                query_text = None
                row_count = None

                if msg.attachments:
                    for attachment in msg.attachments:
                        if attachment.query:
                            statement_id = attachment.query.statement_id
                            query_text = attachment.query.query
                            if attachment.query.query_result_metadata:
                                row_count = attachment.query.query_result_metadata.row_count
                            break  # Use first query attachment

                # Extract error info if available
                error_message = None
                if msg.error:
                    error_message = msg.error.error

                messages.append({
                    "space_id": space_id,
                    "conversation_id": conversation_id,
                    "message_id": msg.message_id,
                    "content": msg.content,
                    "user_id": msg.user_id,
                    "created_timestamp": msg.created_timestamp,
                    "last_updated_timestamp": msg.last_updated_timestamp,
                    "status": msg.status.value if msg.status else None,
                    "num_attachments": len(msg.attachments) if msg.attachments else 0,
                    "statement_id": statement_id,
                    "query_text": query_text,
                    "row_count": row_count,
                    "error_message": error_message
                })

            if not response.next_page_token:
                break
            page_token = response.next_page_token

    except Exception as e:
        print(f"Error fetching messages for conversation {conversation_id} in space {space_id}: {e}")
        continue

if messages:
    # Both timestamp columns are divided by 1000 before the conversion to a timestamp.
    genie_messages_df = spark.createDataFrame(messages, 'space_id string, conversation_id string, message_id string, content string, user_id string, created_timestamp bigint, last_updated_timestamp bigint, status string, num_attachments int, statement_id string, query_text string, row_count string, error_message string') \
        .withColumn('created_timestamp', F.from_unixtime(F.col('created_timestamp')/1000).cast('timestamp')) \
        .withColumn('last_updated_timestamp', F.from_unixtime(F.col('last_updated_timestamp')/1000).cast('timestamp'))
    genie_messages_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(
        f"{catalog_name}.{schema_name}.adb_genie_messages"
    )
    print(f"Loaded {genie_messages_df.count()} Genie messages into table adb_genie_messages")
else:
    print("No Genie messages found or process skipped.")

## Lakeview Dashboards, Schedules, and Subscriptions

This section collects metadata about Lakeview dashboards and their distribution:
* **adb_dashboards**: All Lakeview dashboards in the workspace
* **adb_dashboard_schedules**: Scheduled refresh/distribution for dashboards
* **adb_dashboard_subscriptions**: User and destination subscriptions for dashboard schedules

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Uses the shared client `w` created earlier in the notebook.


def _parse_iso_timestamp(value):
    """Parse an ISO-8601 string into a datetime.

    A trailing 'Z' is rewritten to '+00:00' first, the same normalization the
    apps cell applies, because ``datetime.fromisoformat`` rejects the 'Z' suffix
    on older Python versions. Strings that cannot be parsed become ``None``;
    non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        return datetime.fromisoformat(value.replace('Z', '+00:00'))
    except ValueError:
        return None


# One tuple per dashboard, in the column order of `schema` below.
rows = []
for d in w.lakeview.list(page_size=100):
    d_dict = d.as_dict()
    rows.append((
        d_dict.get("dashboard_id"),
        d_dict.get("display_name"),
        _parse_iso_timestamp(d_dict.get("create_time")),
        d_dict.get("lifecycle_state"),
        _parse_iso_timestamp(d_dict.get("update_time")),
        d_dict.get("warehouse_id")
    ))

schema = StructType([
    StructField('dashboard_id', StringType(), True),
    StructField('display_name', StringType(), True),
    StructField('create_time', TimestampType(), True),
    StructField('lifecycle_state', StringType(), True),
    StructField('update_time', TimestampType(), True),
    StructField('warehouse_id', StringType(), True)
])

df = spark.createDataFrame(
    rows,
    schema
)
df.write.mode("overwrite").option("mergeSchema", "true").saveAsTable(f"{catalog_name}.{schema_name}.adb_dashboards")

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime

# Uses the shared client `w` created earlier in the notebook.

schedules = []

# Dashboard ids come from the table written by the dashboards cell.
dashboard_ids = [
    row.dashboard_id
    for row in spark.table(f"{catalog_name}.{schema_name}.adb_dashboards").select("dashboard_id").collect()
]

for dashboard_id in dashboard_ids:
    try:
        for sched in w.lakeview.list_schedules(dashboard_id=dashboard_id):
            sched_dict = sched.as_dict()
            schedules.append({
                "dashboard_id": dashboard_id,
                "schedule_id": sched_dict.get("schedule_id"),
                "create_time": sched_dict.get("create_time"),
                "display_name": sched_dict.get("display_name"),
                "pause_status": sched_dict.get("pause_status")
            })
    except Exception as e:
        # Best effort: report the dashboard that failed and move on to the next one.
        print(f"Error fetching schedules for dashboard {dashboard_id}: {e}")
        continue

# An explicit schema lets the table be written even when no schedule was found.
# The subscriptions cell reads adb_dashboard_schedules unconditionally, so the
# table must exist and must not keep rows from an earlier run.
spark_df_sched = spark.createDataFrame(
    schedules,
    "dashboard_id string, schedule_id string, create_time string, display_name string, pause_status string",
)
spark.sql(f"DROP TABLE IF EXISTS {catalog_name}.{schema_name}.adb_dashboard_schedules")
spark_df_sched.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(f"{catalog_name}.{schema_name}.adb_dashboard_schedules")
print(f"Loaded {len(schedules)} dashboard schedules into table adb_dashboard_schedules")

In [0]:
# Databricks notebook Python
from databricks.sdk import WorkspaceClient
import pandas as pd

# Fully qualified table names built from the notebook parameters.
# NOTE(review): source_table is not read by any active statement in this cell;
# confirm whether it is still needed.
source_table = f"{catalog_name}.{schema_name}.adb_dashboards"
target_table = f"{catalog_name}.{schema_name}.adb_dashboard_subscriptions"  # as requested

#w = WorkspaceClient()

# ---------- helpers ----------
def get_field(obj_or_dict, *names):
    """Look up ``names`` in order and return the first value that is not None.

    Dicts are read by key (a missing key counts as None); anything else is
    read by attribute. Returns None when no candidate yields a value.
    """
    is_mapping = isinstance(obj_or_dict, dict)
    for candidate in names:
        if is_mapping:
            value = obj_or_dict.get(candidate)
        else:
            value = getattr(obj_or_dict, candidate, None)
        if value is not None:
            return value
    return None

def to_dict_safe(x):
    """Return ``x.as_dict()`` as a dict, or ``{}`` when that is not possible.

    ``{}`` is returned for None, for objects without an ``as_dict`` attribute,
    and when ``as_dict()`` itself returns something falsy.
    """
    if x is None or not hasattr(x, "as_dict"):
        return {}
    return x.as_dict() or {}

# ---------- collect subscriptions for every known schedule ----------
from pyspark.sql import functions as F

# Note: collect() brings every schedule row to the driver.
sched_df = spark.table(f"{catalog_name}.{schema_name}.adb_dashboard_schedules").collect()
rows = []

for r in sched_df:
    dashboard_id = r["dashboard_id"]
    schedule_id = r["schedule_id"]
    try:
        # enumerate subscriptions for this schedule
        for sub in w.lakeview.list_subscriptions(dashboard_id=dashboard_id, schedule_id=schedule_id):
            subdict = to_dict_safe(sub)
            subscriber = get_field(subdict, "subscriber") or get_field(sub, "subscriber") or {}

            user_subscriber = get_field(subscriber, "user_subscriber")
            destination_subscriber = get_field(subscriber, "destination_subscriber")

            user_id = get_field(user_subscriber or {}, "user_id", "id")
            destination_id = get_field(destination_subscriber or {}, "destination_id", "destination", "id")

            rows.append({
                "dashboard_id": dashboard_id,
                "schedule_id": schedule_id,
                "subscription_id": get_field(subdict, "subscription_id") or get_field(sub, "subscription_id"),
                # Read the timestamp from the subscription itself. This cell never
                # assigns a `create_time` variable, so the bare name would resolve
                # to whatever an earlier cell left in the notebook's globals.
                "create_time": get_field(subdict, "create_time") or get_field(sub, "create_time"),
                # Stored as text, like user_id in adb_genie_messages, so that a mix of
                # ids and nulls cannot be coerced to a floating-point column.
                "user_id": str(user_id) if user_id is not None else None,
                "destination_id": destination_id,
            })

    except Exception as e:
        # Best effort: report the schedule that failed and move on to the next one.
        print(f"Error fetching subscriptions for schedule {schedule_id} of dashboard {dashboard_id}: {e}")
        continue

count = len(rows)
print(f"Collected {count} subscription rows across dashboards.")

# ---------- write to Delta table ----------
# An explicit schema is required so that an empty result can still be turned into
# a DataFrame and the table still exists afterwards.
spark_df = (
    spark.createDataFrame(
        rows,
        "dashboard_id string, schedule_id string, subscription_id string, "
        "create_time string, user_id string, destination_id string",
    )
    .withColumn("create_time", F.to_timestamp("create_time"))
    .select("dashboard_id", "schedule_id", "subscription_id", "create_time", "user_id", "destination_id")
)

spark.sql(f"DROP TABLE IF EXISTS {target_table}")
spark_df.write.format("delta").mode("overwrite").saveAsTable(target_table)

## Unity Catalog Models

This section collects metadata about registered models in Unity Catalog:
* **adb_models**: All registered models with permissions and metadata

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
import pandas as pd

# Uses the shared client `w` created earlier in the notebook.


def _epoch_ms_to_datetime(value):
    """Convert an epoch-milliseconds number to a datetime.

    Positive numbers are converted with ``datetime.fromtimestamp``. Numbers that
    are not positive or cannot be converted become ``None`` so that a raw number
    never ends up in a datetime column. Any other value (for example ``None``)
    is returned unchanged.
    """
    if isinstance(value, (int, float)):
        if value <= 0:
            return None
        try:
            return datetime.fromtimestamp(value / 1000)
        except (OverflowError, OSError, ValueError):
            return None
    return value


models = []
permission_errors = []  # (model full name, error text) for every failed permission lookup

try:
    for model in w.registered_models.list():
        model_dict = model.as_dict() if hasattr(model, 'as_dict') else {}

        # Fall back to catalog.schema.name when full_name is absent.
        full_name = model_dict.get('full_name') or f"{model_dict.get('catalog_name', '')}.{model_dict.get('schema_name', '')}.{model_dict.get('name', '')}"

        # Get permissions summary (best effort; the counts stay at 0 on failure)
        num_users_with_access = 0
        num_groups_with_access = 0
        try:
            if full_name and full_name != '..':
                # NOTE(review): confirm that the installed SDK exposes get_permissions on
                # `w.registered_models`. If it does not, every lookup fails here and is
                # reported in the summary printed below.
                perms = w.registered_models.get_permissions(full_name)
                for entry in getattr(perms, 'access_control_list', None) or []:
                    if getattr(entry, 'user_name', None):
                        num_users_with_access += 1
                    if getattr(entry, 'group_name', None):
                        num_groups_with_access += 1
        except Exception as e:
            # Keep going without permissions, but remember the failure so that a
            # count of 0 can be told apart from "lookup failed".
            permission_errors.append((full_name, str(e)))

        models.append({
            "full_name": model_dict.get('full_name'),
            "name": model_dict.get('name'),
            "catalog_name": model_dict.get('catalog_name'),
            "schema_name": model_dict.get('schema_name'),
            "created_at": _epoch_ms_to_datetime(model_dict.get('created_at')),
            "created_by": model_dict.get('created_by'),
            "updated_at": _epoch_ms_to_datetime(model_dict.get('updated_at')),
            "updated_by": model_dict.get('updated_by'),
            "owner": model_dict.get('owner'),
            "comment": model_dict.get('comment'),
            "num_users_with_access": num_users_with_access,
            "num_groups_with_access": num_groups_with_access
        })
except Exception as e:
    print(f"Error listing models: {e}")

if permission_errors:
    first_name, first_error = permission_errors[0]
    print(
        f"Could not read permissions for {len(permission_errors)} model(s); "
        f"their access counts are reported as 0. First failure ({first_name}): {first_error}"
    )

pdf = pd.DataFrame(models)
if not pdf.empty:
    spark_df = spark.createDataFrame(pdf)
    spark_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(
        f"{catalog_name}.{schema_name}.adb_models"
    )
    print(f"Loaded {len(models)} models into table adb_models")
else:
    print("No models found.")


## Model Serving Endpoints

This section collects metadata about model serving endpoints:
* **adb_serving_endpoints**: All serving endpoints with their models and permissions

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
import pandas as pd

# Uses the shared client `w` created earlier in the notebook.

serving_endpoints = []
permission_errors = []  # (endpoint name, error text) for every failed permission lookup

try:
    for endpoint in w.serving_endpoints.list():
        endpoint_dict = endpoint.as_dict() if hasattr(endpoint, 'as_dict') else {}
        endpoint_name = endpoint_dict.get('name')

        # Get permissions summary (best effort; the counts stay at 0 on failure)
        num_users_with_access = 0
        num_groups_with_access = 0
        try:
            if endpoint_name:
                perms = w.serving_endpoints.get_permissions(endpoint_name)
                for entry in getattr(perms, 'access_control_list', None) or []:
                    if getattr(entry, 'user_name', None):
                        num_users_with_access += 1
                    if getattr(entry, 'group_name', None):
                        num_groups_with_access += 1
        except Exception as e:
            # Keep going without permissions, but remember the failure so that a
            # count of 0 can be told apart from "lookup failed".
            permission_errors.append((endpoint_name, str(e)))

        # Extract model info from config. `served_models` is read first, exactly as
        # before; `served_entities` is consulted only when it produced no names.
        # NOTE(review): assumes `served_entities` entries carry `entity_name` / `name`;
        # confirm against the serving endpoints API reference.
        config = endpoint_dict.get('config', {})
        models_info = []
        if isinstance(config, dict):
            for list_key, name_keys in (
                ('served_models', ('model_name', 'name')),
                ('served_entities', ('entity_name', 'name')),
            ):
                if models_info:
                    break
                served_items = config.get(list_key, [])
                if not isinstance(served_items, list):
                    continue
                for item in served_items:
                    if not isinstance(item, dict):
                        continue
                    served_name = item.get(name_keys[0]) or item.get(name_keys[1])
                    if served_name:
                        models_info.append(served_name)

        # Convert timestamps (a positive number is divided by 1000 and converted)
        creation_timestamp = endpoint_dict.get('creation_timestamp')
        last_updated_timestamp = endpoint_dict.get('last_updated_timestamp')
        if isinstance(creation_timestamp, (int, float)) and creation_timestamp > 0:
            try:
                creation_timestamp = datetime.fromtimestamp(creation_timestamp / 1000)
            except Exception:
                creation_timestamp = None
        if isinstance(last_updated_timestamp, (int, float)) and last_updated_timestamp > 0:
            try:
                last_updated_timestamp = datetime.fromtimestamp(last_updated_timestamp / 1000)
            except Exception:
                last_updated_timestamp = None

        serving_endpoints.append({
            "name": endpoint_dict.get('name'),
            "id": endpoint_dict.get('id'),
            "creation_timestamp": creation_timestamp,
            "creator": endpoint_dict.get('creator'),
            "last_updated_timestamp": last_updated_timestamp,
            "state": endpoint_dict.get('state'),
            "models": ','.join(models_info) if models_info else None,
            "num_users_with_access": num_users_with_access,
            "num_groups_with_access": num_groups_with_access
        })
except Exception as e:
    print(f"Error listing serving endpoints: {e}")

if permission_errors:
    first_name, first_error = permission_errors[0]
    print(
        f"Could not read permissions for {len(permission_errors)} serving endpoint(s); "
        f"their access counts are reported as 0. First failure ({first_name}): {first_error}"
    )

pdf = pd.DataFrame(serving_endpoints)
if not pdf.empty:
    spark_df = spark.createDataFrame(pdf)
    spark_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(
        f"{catalog_name}.{schema_name}.adb_serving_endpoints"
    )
    print(f"Loaded {len(serving_endpoints)} serving endpoints into table adb_serving_endpoints")
else:
    print("No serving endpoints found.")


## Databricks Apps

This section collects metadata about Databricks Apps:
* **adb_apps**: All apps with their status and permissions

In [0]:
from databricks.sdk import WorkspaceClient
from datetime import datetime
import pandas as pd

# Uses the shared client `w` created earlier in the notebook.

apps = []
permission_errors = []  # (app name, error text) for every failed permission lookup

try:
    for app in w.apps.list():
        app_dict = app.as_dict() if hasattr(app, 'as_dict') else {}
        app_name = app_dict.get('name')

        # Get permissions summary (best effort; the counts stay at 0 on failure)
        num_users_with_access = 0
        num_groups_with_access = 0
        try:
            if app_name:
                perms = w.apps.get_permissions(app_name)
                for entry in getattr(perms, 'access_control_list', None) or []:
                    if getattr(entry, 'user_name', None):
                        num_users_with_access += 1
                    if getattr(entry, 'group_name', None):
                        num_groups_with_access += 1
        except Exception as e:
            # Keep going without permissions, but remember the failure so that a
            # count of 0 can be told apart from "lookup failed".
            permission_errors.append((app_name, str(e)))

        # Convert timestamps. A trailing 'Z' is rewritten to '+00:00' before parsing;
        # strings that cannot be parsed become None.
        create_time = app_dict.get('create_time')
        update_time = app_dict.get('update_time')
        if isinstance(create_time, str):
            try:
                create_time = datetime.fromisoformat(create_time.replace('Z', '+00:00'))
            except Exception:
                create_time = None
        if isinstance(update_time, str):
            try:
                update_time = datetime.fromisoformat(update_time.replace('Z', '+00:00'))
            except Exception:
                update_time = None

        apps.append({
            "name": app_dict.get('name'),
            "id": app_dict.get('id'),
            "create_time": create_time,
            "creator": app_dict.get('creator'),
            "update_time": update_time,
            "updater": app_dict.get('updater'),
            "url": app_dict.get('url'),
            "app_status": app_dict.get('app_status'),
            "compute_status": app_dict.get('compute_status'),
            "num_users_with_access": num_users_with_access,
            "num_groups_with_access": num_groups_with_access
        })
except Exception as e:
    print(f"Error listing apps: {e}")

if permission_errors:
    first_name, first_error = permission_errors[0]
    print(
        f"Could not read permissions for {len(permission_errors)} app(s); "
        f"their access counts are reported as 0. First failure ({first_name}): {first_error}"
    )

pdf = pd.DataFrame(apps)
if not pdf.empty:
    spark_df = spark.createDataFrame(pdf)
    spark_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(
        f"{catalog_name}.{schema_name}.adb_apps"
    )
    print(f"Loaded {len(apps)} apps into table adb_apps")
else:
    print("No apps found.")
