%md
# Notebook to sync permissions from the source to the destination Unity Catalog (UC)
---
This notebook synchronizes permissions (grants) on catalogs, schemas, tables, and volumes from the source workspace to the matching objects in the destination workspace.

Source Workspace: URL of the source workspace

Destination Workspace: URL of the destination workspace


In [0]:
# declare the notebook text widgets: (widget name, label); both start out empty
for widget_name, widget_label in (
    ("source_workspace", "Source Workspace"),
    ("destination_workspace", "Destination Workspace"),
):
    dbutils.widgets.text(widget_name, "", widget_label)


In [0]:
# read the workspace values entered in the widgets (source first, then destination)
source_workspace, destination_workspace = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("source_workspace", "destination_workspace")
)

In [0]:
# service principal credentials are read from a keyvault backed secret scope
secret_scope = 'kvbacked'
clientid, clientsecret, tenantid = (
    dbutils.secrets.get(secret_scope, secret_key)
    for secret_key in ('clientid', 'clientsecret', 'tenantid')
)

In [0]:
# variables
# cats_to_copy lists the catalogs to process: each entry is a dict keyed by the
# catalog name, whose "schemas" value lists that catalog's schemas
cats_to_copy = [{
    "catadb360dev": {
        "schemas": [
            {"schema": schema_name}
            for schema_name in ("schemaadb360dev", "silverdb", "golddb")
        ],
    },
}]

# upper bound on worker threads used when synching table grants in parallel
max_executors = 5

In [0]:
# get the catalogs to work on
# every key of every config entry is a catalog name; iterating over all keys
# (instead of taking only the first one) keeps entries that declare several
# catalogs and tolerates empty entries instead of raising IndexError
catalog_names = [name for cat in cats_to_copy for name in cat]

In [0]:
# collect the schema names configured for one specific catalog
catalog_name = "catadb360dev"
# config entries that declare this catalog
matching_entries = [cat[catalog_name] for cat in cats_to_copy if catalog_name in cat]
schema_names = [
    schema["schema"]
    for entry in matching_entries
    for schema in entry["schemas"]
]

In [0]:
# imports
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors.platform import ResourceAlreadyExists, BadRequest, NotFound
from databricks.sdk.service import catalog
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat

In [0]:
# SDK clients for the source and destination workspace; both authenticate with
# the same service principal and differ only in the host they talk to
sp_credentials = dict(
    azure_client_id=clientid,
    azure_client_secret=clientsecret,
    azure_tenant_id=tenantid,
)
sourceWs = WorkspaceClient(host=source_workspace, **sp_credentials)
destWs = WorkspaceClient(host=destination_workspace, **sp_credentials)

In [0]:
# load the table and volume listings from system.information_schema
# NOTE(review): spark.sql runs against the workspace this notebook is attached to,
# presumably the source workspace - confirm
info_schema_query = "select * from system.information_schema.{view}"
table_info_df = spark.sql(info_schema_query.format(view="tables"))
volume_info_df = spark.sql(info_schema_query.format(view="volumes"))


In [0]:
# synch helper functions
def _privileges_by_principal(effective_permissions) -> dict:
    """Map each principal of a grants response to the set of privileges it holds."""
    privileges = {}
    # both lists are optional on the response objects; treat a missing list as empty
    for assignment in (effective_permissions.privilege_assignments or []):
        # setdefault keeps the first assignment seen for a principal
        privileges.setdefault(
            assignment.principal,
            {p.privilege for p in (assignment.privileges or []) if p.privilege is not None})
    return privileges


def sync_grants(w_src: WorkspaceClient, w_tgt: WorkspaceClient, obj_name: str, obj_type: str):
    """Make the grants on one object in the target workspace match the source workspace.

    For every principal that has grants on the object in either workspace, the
    privileges missing in the target are added and the privileges that exist only
    in the target are removed, using a single ``grants.update`` call.

    Args:
        w_src: client for the workspace whose grants are the reference.
        w_tgt: client for the workspace whose grants are updated.
        obj_name: full name of the securable (e.g. ``catalog.schema.table``).
        obj_type: securable type passed through to the grants API; callers in this
            notebook pass ``catalog.SecurableType`` members.

    Returns:
        A dict ``{"name": obj_name, "status": ...}`` where status is ``"NotFound"``
        when the object does not exist in the target, ``"SUCCESS"`` when changes
        were applied, and ``None`` when the grants already matched.

    Raises:
        Errors from the source lookup (including ``NotFound``) and from the target
        update are not caught here and propagate to the caller.
    """
    # NOTE(review): get_effective presumably also returns privileges inherited from
    # parent securables, which would then be granted directly on this object - confirm
    source_grants = w_src.grants.get_effective(obj_type, obj_name)

    # if the object does not exist in the secondary workspace, we cannot fetch it
    try:
        target_grants = w_tgt.grants.get_effective(obj_type, obj_name)
    except NotFound:
        return {"name": obj_name, "status": "NotFound"}

    # index both sides once instead of re-scanning the assignments per principal
    source_privs = _privileges_by_principal(source_grants)
    target_privs = _privileges_by_principal(target_grants)

    # create a PermissionsChange for each principal where a difference exists
    change_list = []
    for principal in source_privs.keys() | target_privs.keys():
        # a principal may have grants in only one of the two workspaces
        wanted = source_privs.get(principal, set())
        current = target_privs.get(principal, set())

        add_perms = list(wanted - current)
        rem_perms = list(current - wanted)

        if add_perms or rem_perms:
            # an empty side is passed as None so only real changes are sent
            change_list.append(catalog.PermissionsChange(
                add=add_perms or None,
                remove=rem_perms or None,
                principal=principal))

    # nothing differs: leave the target untouched
    if not change_list:
        return {"name": obj_name, "status": None}

    w_tgt.grants.update(full_name=obj_name,
                        securable_type=obj_type,
                        changes=change_list)
    return {"name": obj_name, "status": "SUCCESS"}


In [0]:
# main loop: for every configured catalog, synch the grants on the catalog itself,
# then on its schemas, its tables (in parallel) and its volumes
for catalog_name in catalog_names:
    # only the schema and object names are needed, so collect just those columns
    filtered_tables = table_info_df.filter(
        (table_info_df.table_catalog == catalog_name) &
        (table_info_df.table_schema != 'information_schema')
    ).select('table_schema', 'table_name').collect()

    filtered_volumes = volume_info_df.filter(
        volume_info_df.volume_catalog == catalog_name
    ).select('volume_schema', 'volume_name').collect()

    # synch catalog grants
    result = sync_grants(sourceWs, destWs, catalog_name, catalog.SecurableType.CATALOG)

    if result["status"] == "NotFound":
        print(f'Error: catalog {catalog_name} not found in target workspace')
    elif result["status"] == "SUCCESS":
        print(f'Catalog {catalog_name} grants synched successfully')
    else:
        print(f'No Changes for  catalog {catalog_name}')

    # fully qualified names of the tables and volumes in this catalog
    table_names = [f"{catalog_name}.{row['table_schema']}.{row['table_name']}"
                   for row in filtered_tables]
    volume_names = [f"{catalog_name}.{row['volume_schema']}.{row['volume_name']}"
                    for row in filtered_volumes]

    # schemas are derived from the tables AND the volumes found, so a schema that
    # only holds volumes still gets its grants synched; sorted for stable output
    schemas = sorted(
        {f"{catalog_name}.{row['table_schema']}" for row in filtered_tables} |
        {f"{catalog_name}.{row['volume_schema']}" for row in filtered_volumes})

    # synch permissions to schemas
    for schema in schemas:
        result = sync_grants(sourceWs, destWs, schema, catalog.SecurableType.SCHEMA)
        if result["status"] == "NotFound":
            print(f'Error: schema {schema} not found in target workspace')
        elif result["status"] == "SUCCESS":
            print(f'Schema {schema} grants synched successfully')
        else:
            print(f'No Changes for  schema {schema}')

    # synch permissions for tables, up to max_executors tables at a time;
    # leaving the with-block waits for all submitted calls to finish
    with ThreadPoolExecutor(max_workers=max_executors) as executor:
        table_results = executor.map(
            sync_grants,
            repeat(sourceWs),
            repeat(destWs),
            table_names,
            repeat(catalog.SecurableType.TABLE)
        )

    # results come back in table_names order; an exception raised by a worker
    # is re-raised here when its result is reached
    for result in table_results:
        name = result['name']
        if result["status"] == "SUCCESS":
            print(f'synched grants for table {name}')
        elif result["status"] == "NotFound":
            print(f'Error: table {name} not found in target workspace')
        else:
            print(f'No Changes for table {name}')

    # synch permission for volumes
    for volume_name in volume_names:
        result = sync_grants(sourceWs, destWs, volume_name, catalog.SecurableType.VOLUME)
        if result["status"] == "NotFound":
            print(f'Error: volume {volume_name} not found in target workspace')
        elif result["status"] == "SUCCESS":
            print(f'volume {volume_name} grants synched successfully')
        else:
            print(f'No Changes for  volume {volume_name}')


print("finished synching grants")


In [0]:
# debug output: volume_names is assigned inside the main loop, so this shows the
# volumes of the last catalog that was processed
print(volume_names)