## Pull in UDFs

In [1]:
%run nb_udfs

StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 13, Finished, Available, Finished)

## Run Workspaces

In [2]:
# Notebook configuration. `workspace`, `lakehouse` and the two table names are
# passed to udf_GetFilePath further down to resolve the target table paths.

# Workspace name, written pre-escaped ('%20' stands for the space).
workspace = 'Admin%20Center' # the '&' symbol and spaces have to be escaped
# Lakehouse that receives the dimension tables.
lakehouse = 'lh_monitoring'

# Target table for the workspace dimension.
ws_table = 'dimWorkspaces'
# Target table for the workspace-user dimension.
workspace_users_table = 'dimWorkspaceUsers'
# User name reported by the session environment; not referenced by the cells
# visible in this notebook section.
current_user = mssparkutils.env.getUserName()

StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 14, Finished, Available, Finished)

In [3]:
# Fetch the workspace list from the admin endpoint, flatten the JSON records
# with pandas, and hand the result to Spark for the cells that follow.
# NOTE(review): only the 'workspaces' array of this single response is read;
# confirm whether the endpoint can return further pages.
response = fab_client.get("/v1/admin/workspaces")
workspace_records = response.json()['workspaces']
df_workspaces = spark.createDataFrame(pd.json_normalize(workspace_records))


StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 15, Finished, Available, Finished)

In [4]:
# Maintain the workspace table as a slowly changing dimension so deleted
# workspaces stay visible in the history.
naturalKeyColumnList = ['id']        # business key: the workspace id
primaryKeyColumnName = "tableId"     # key column name handed to the upsert helper
workspacePath = udf_GetFilePath(workspace, lakehouse, ws_table)
returnVal = udf_UpsertDimension(
    df_workspaces,
    2,  # presumably the slowly-changing-dimension type; confirm against nb_udfs
    workspacePath,
    naturalKeyColumnList,
    primaryKeyColumnName,
    True,  # flag semantics are defined by udf_UpsertDimension in nb_udfs
)
print(returnVal)

StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 16, Finished, Available, Finished)

✅ Upsert complete
{'startTime': '2025-10-07 22:01:11.020843', 'stopTime': '2025-10-07 22:01:40.706551', 'details': '417 records updated, 417 records inserted from 417 staging rows to abfss://e54b972a-76a7-4a96-90ab-77441da0157e@onelake.dfs.fabric.microsoft.com/9b744bc6-b68b-4136-9983-4a665a8d5c9c/Tables/dimWorkspaces'}


## Get the Workspace Users

In [5]:
# Narrow the workspace list to the ones we want users for: entries whose type
# is "Workspace" and whose state is "Active".
is_regular_workspace = df_workspaces["type"] == "Workspace"
is_active_workspace = df_workspaces["state"] == "Active"
df_np_workspaces = df_workspaces.where(is_regular_workspace & is_active_workspace)

StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 17, Finished, Available, Finished)

In [6]:
def _request_workspace_users(workspace_id):
    """Issue the non-admin "list users" GET request for one workspace."""
    # The admin REST API only allows 200 calls an hour, so the non-admin
    # endpoint is used to get through all workspaces much faster.
    # Admin equivalent: /v1.0/myorg/admin/groups/{workspace_id}/users
    return _base_api(
        request=f"/v1.0/myorg/groups/{workspace_id}/users",
        method="get"
    )


def process_workspace_users(row):
    """Fetch the users of one workspace and return them as a Spark DataFrame.

    Args:
        row: Mapping-like object exposing the workspace ``'id'`` and ``'name'``.

    Returns:
        ``{'workspace_users': df}`` where ``df`` is a Spark DataFrame of the
        users with two extra columns, ``workspaceId`` and ``compositeKey``
        (workspace id concatenated with ``emailAddress``).
        ``{'workspace_users': None}`` when the request or JSON parsing fails,
        the response carries no ``'value'`` key, or the user list is empty.

    A 429 response is retried exactly once after sleeping for the number of
    seconds in the ``Retry-After`` header (3600 when the header is missing).
    """
    workspace_id = row['id']
    workspace_name = row['name']
    try:
        response = _request_workspace_users(workspace_id)
        if response.status_code == 429:
            retry_after = int(response.headers.get("Retry-After", 3600))
            print(f"⛔️ Hit 429 rate limit. Sleeping for {retry_after} seconds...")
            time.sleep(retry_after)
            # Retry once after sleeping
            response = _request_workspace_users(workspace_id)
        # Parse the body once, inside the try, so a non-JSON body is reported
        # here instead of raising out of the worker thread.
        payload = response.json()
    except Exception as e:
        print(f"❌ Exception on workspace {workspace_name} ({workspace_id}): {str(e)}")
        return {'workspace_users': None}

    # Guard against error payloads (or non-object bodies) that lack 'value'.
    if not isinstance(payload, dict) or 'value' not in payload:
        print(f"⚠️ No 'value' key in response for workspace {workspace_name} ({workspace_id})")
        print(f"🔍 Status Code: {response.status_code} Full response: {payload}")
        return {'workspace_users': None}

    users = payload['value']
    if not users:
        # An empty list would become an empty pandas frame, and Spark cannot
        # infer a schema from that; skip the workspace instead of raising.
        print(f"ℹ️ No users returned for workspace {workspace_name} ({workspace_id})")
        return {'workspace_users': None}

    df_workspace_users = spark.createDataFrame(pd.json_normalize(users))
    df_workspace_users = df_workspace_users.withColumn("workspaceId", lit(workspace_id))
    # NOTE(review): concat yields NULL when emailAddress is NULL, so such rows
    # get a NULL compositeKey; confirm every principal carries an emailAddress.
    df_workspace_users = df_workspace_users.withColumn(
        "compositeKey",
        concat(df_workspace_users["workspaceId"], df_workspace_users["emailAddress"])
    )

    return {'workspace_users': df_workspace_users}






StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 18, Finished, Available, Finished)

In [7]:

# Bring the filtered workspace rows to the driver and fetch each workspace's
# users on a small thread pool.
rows_workspaces = df_np_workspaces.collect()
results_workspace_users = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # Key each future by its source row so a failure can name the workspace.
    futures_wu = {executor.submit(process_workspace_users, row): row for row in rows_workspaces}
    for future in as_completed(futures_wu):
        source_row = futures_wu[future]
        try:
            # result() re-raises anything the worker raised, so it has to sit
            # inside the try for one bad workspace not to abort the loop.
            result = future.result()
            # If the function returned a non-null Spark DataFrame, add it to the list
            if result["workspace_users"] is not None:
                results_workspace_users.append(result["workspace_users"])
        except Exception as e:
            print(f"❌ Failed to get users for workspace {source_row['name']}: {e}")
# Once all threads are done, print a completion message
print('✅ Looping complete!')


StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 19, Finished, Available, Finished)

✅ Looping complete!


In [8]:
# Union the per-workspace results and upsert them into the workspace-user
# dimension. The upsert only runs when there is something to write; otherwise
# df_workspace_users would be undefined (or stale from an earlier run).
if results_workspace_users:
    df_workspace_users = union_batches(results_workspace_users,batch_size=50)
    print("✅ workspace users dataframe ready")

    # Maintain a slowly changing dimension so removed workspace users stay
    # visible in the history.
    workspaceUserPath = udf_GetFilePath(workspace, lakehouse, workspace_users_table)
    naturalKeyColumnList = ['compositeKey']
    primaryKeyColumnName = "tableId"
    returnVal = udf_UpsertDimension(df_workspace_users,2,workspaceUserPath,naturalKeyColumnList,primaryKeyColumnName,True)
    print(returnVal)
else:
    print("⚠️ No workspace users found.")

StatementMeta(, d9dc0eae-d1e7-487a-8529-9ba990563561, 20, Finished, Available, Finished)

✅ workspace users dataframe ready
📦 Initial load complete
{'startTime': '2025-10-07 22:02:45.978077', 'stopTime': '2025-10-07 22:02:49.577612', 'details': '0 records updated, 60 records inserted from 60 staging rows to abfss://e54b972a-76a7-4a96-90ab-77441da0157e@onelake.dfs.fabric.microsoft.com/9b744bc6-b68b-4136-9983-4a665a8d5c9c/Tables/dimWorkspaceUsers'}
