## Pull in UDFs

In [1]:
%run nb_udfs

StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 13, Finished, Available, Finished)

## Run Workspaces

In [2]:
# Notebook configuration: where the tenant metadata lives and which tables get loaded.
# The workspace name is kept URL-encoded (it decodes to "BI & Analytics Administration").
workspace = "BI%20%26%20Analytics%20Administration"
lakehouse = "lh_tenant_data"

# Table names handed to udf_GetFilePath when building the Delta table paths.
ws_table = "dimWorkspaces"
workspace_users_table = "dimWorkspaceUsers"

# User name of the current session, as reported by mssparkutils.
current_user = mssparkutils.env.getUserName()

StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 14, Finished, Available, Finished)

In [3]:
# Pull the tenant-wide workspace list from the admin API and stage it as a Spark DataFrame.
# NOTE(review): only a single response is read here - confirm whether fab_client follows
# continuation tokens itself, otherwise large tenants may be truncated.
response = fab_client.get("/v1/admin/workspaces")
df_workspaces = spark.createDataFrame(
    pd.json_normalize(response.json()['workspaces'])
)


StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 15, Finished, Available, Finished)

In [4]:
# Upsert the staged workspaces into the dimWorkspaces table as a slowly changing
# dimension, so workspaces that get deleted remain visible in the history.
naturalKeyColumnList = ['id']
primaryKeyColumnName = "tableId"
workspacePath = udf_GetFilePath(workspace, lakehouse, ws_table)

returnVal = udf_UpsertDimension(
    df_workspaces,
    2,  # dimension type (type 2 here)
    workspacePath,
    naturalKeyColumnList,
    primaryKeyColumnName,
    True,  # NOTE(review): meaning of this flag is defined in nb_udfs - confirm
)
print(returnVal)

StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 16, Finished, Available, Finished)

✅ Upsert complete
{'startTime': '2025-11-23 04:01:16.040607', 'stopTime': '2025-11-23 04:01:58.322403', 'details': '0 records updated, 0 records inserted from 8774 staging rows to abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimWorkspaces'}


## Get the Workspace Users

In [15]:
# Narrow the workspace list to the ones we want users for:
# rows whose type is "Workspace" and whose state is "Active".
is_workspace_type = df_workspaces["type"] == "Workspace"
is_active_state = df_workspaces["state"] == "Active"

df_np_workspaces = df_workspaces.filter(is_workspace_type & is_active_state)

StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 27, Finished, Available, Finished)

8774
789


In [16]:
def process_workspace_users(row, max_retries=3):
    """Fetch the users of one workspace and return them as a Spark DataFrame.

    The non-admin ``/v1.0/myorg/groups/{id}/users`` endpoint is called through
    ``_base_api``. HTTP 429 responses are retried (after sleeping for the
    ``Retry-After`` interval) up to ``max_retries`` times. Every failure mode is
    reported with ``print`` and turned into a ``None`` result so that one bad
    workspace does not stop the overall run.

    Args:
        row: Workspace record; ``row['id']`` and ``row['name']`` are read.
        max_retries: Maximum number of re-sends after a 429 response.

    Returns:
        dict: ``{'workspace_users': <Spark DataFrame>}`` with the added columns
        ``workspaceId`` and ``compositeKey`` on success, otherwise
        ``{'workspace_users': None}`` (request error, non-2xx status, invalid
        JSON, missing ``value`` key, empty user list, or conversion failure).
    """
    workspace_id = row['id']
    workspace_name = row['name']
    # admin rest api only allows for 200 calls an hour. Using the non admin endpoint so we
    # can go through all workspaces (admin form: /v1.0/myorg/admin/groups/{workspace_id}/users)
    request_path = f"/v1.0/myorg/groups/{workspace_id}/users"

    try:
        response = _base_api(request=request_path, method="get")
        retries = 0
        # Keep retrying while throttled; a single retry often lands on another 429.
        while response.status_code == 429 and retries < max_retries:
            try:
                retry_after = int(response.headers.get("Retry-After", 3600))
            except (TypeError, ValueError):
                # Header present but not an integer number of seconds: use the same default.
                retry_after = 3600
            print(f"‚õîÔ∏è Hit 429 rate limit. Sleeping for {retry_after} seconds...")
            time.sleep(retry_after)
            retries += 1
            response = _base_api(request=request_path, method="get")
    except Exception as e:
        print(f"‚ùå Exception on workspace {workspace_name} ({workspace_id}): {str(e)}")
        return {'workspace_users': None}

    # --- Handle other non-success statuses (including a 429 that outlived the retries) ---
    status = response.status_code
    if status is None or status < 200 or status >= 300:
        print(
            f"‚ö†Ô∏è Non-success status for workspace {workspace_name} "
            f"({workspace_id}): {status}"
        )
        # response.text is safe even if it's not JSON
        try:
            print(f"Response body: {response.text}")
        except Exception:
            pass
        return {'workspace_users': None}

    # --- At this point we *expect* JSON, but still guard it ---
    try:
        data = response.json()
    except ValueError:
        print(
            f"‚ö†Ô∏è Failed to parse JSON for workspace {workspace_name} "
            f"({workspace_id}). Raw response:"
        )
        try:
            print(response.text[:500])
        except Exception:
            pass
        return {'workspace_users': None}

    # Guard against payloads without a 'value' list (prevents KeyError/TypeError below).
    if not isinstance(data, dict) or 'value' not in data:
        print(f"‚ö†Ô∏è No 'value' key in response for workspace {workspace_name} ({workspace_id})")
        print(f"üîç Status Code: {response.status_code} Full response: {data}")
        return {'workspace_users': None}

    users = data['value']
    if not users:
        # Nothing to load; Spark cannot infer a schema from an empty pandas frame.
        return {'workspace_users': None}

    try:
        df_workspace_users = spark.createDataFrame(pd.json_normalize(users))
        df_workspace_users = df_workspace_users.withColumn("workspaceId", lit(workspace_id))
        # NOTE(review): concat yields NULL when emailAddress is NULL, and the column lookup
        # fails if no user in the payload has an emailAddress - confirm this key is intended.
        df_workspace_users = df_workspace_users.withColumn(
            "compositeKey",
            concat(df_workspace_users["workspaceId"], df_workspace_users["emailAddress"]),
        )
    except Exception as e:
        print(f"‚ùå Failed to build users DataFrame for workspace {workspace_name} ({workspace_id}): {e}")
        return {'workspace_users': None}

    return {'workspace_users': df_workspace_users}


StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 28, Finished, Available, Finished)

In [17]:

# Fetch the users of every selected workspace in parallel and keep the DataFrames
# that came back non-empty.
rows_workspaces = df_np_workspaces.collect()
results_workspace_users = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its workspace row so failures can be reported by name.
    futures_wu = {executor.submit(process_workspace_users, row): row for row in rows_workspaces}
    for future in as_completed(futures_wu):
        row = futures_wu[future]
        try:
            # result() re-raises anything the worker raised, so it must sit inside the try.
            result = future.result()
            # If the function returned a non-null Spark DataFrame, add it to the list
            if result["workspace_users"] is not None:
                results_workspace_users.append(result["workspace_users"])
        except Exception as e:
            print(f"‚ùå Failed to get users for workspace {row['name']}: {e}")
# Once all threads are done, print a completion message
print('‚úÖ Looping complete!')


StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 29, Finished, Available, Finished)

⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 23 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. Sleeping for 52 seconds...
⛔️ Hit 429 rate limit. S

In [18]:
# Union the per-workspace DataFrames together. When nothing was collected the
# variable is set to None explicitly so a stale DataFrame from an earlier run
# can never be upserted by accident.
if results_workspace_users:
    df_workspace_users = union_batches(results_workspace_users,batch_size=50)
    print("‚úÖ workspace users dataframe ready")
else:
    df_workspace_users = None
    print("‚ö†Ô∏è No workspace users found.")

# Target path is always computed because the OPTIMIZE cell further down reads it.
workspaceUserPath = udf_GetFilePath(workspace, lakehouse, workspace_users_table)
#print(workspaceUserPath)
naturalKeyColumnList = ['compositeKey']
primaryKeyColumnName = "tableId"

if df_workspace_users is not None:
    # Upsert so removed workspace users can be tracked.
    # do not make this a type 2 dimension! It gets way too big and consumes significant capacity.
    returnVal = udf_UpsertDimension(df_workspace_users,1,workspaceUserPath,naturalKeyColumnList,primaryKeyColumnName,True)
    print(returnVal)
else:
    print("Skipping dimWorkspaceUsers upsert: nothing to load.")

StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 30, Finished, Available, Finished)

✅ workspace users dataframe ready
abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimWorkspaceUsers
📦 Initial load complete
{'startTime': '2025-11-23 04:27:41.805536', 'stopTime': '2025-11-23 04:29:02.759917', 'details': '0 records updated, 7803 records inserted from 7803 staging rows to abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimWorkspaceUsers'}


In [20]:
# Compact the workspace-users Delta table and Z-order it by ETLHashKey.
# NOTE(review): ETLHashKey is not created in this notebook - presumably added by
# udf_UpsertDimension; confirm the column exists.
optimize_statement = f"""
OPTIMIZE delta.`{workspaceUserPath}`
ZORDER BY (ETLHashKey)
"""
spark.sql(optimize_statement)
print(f'Optimized {workspaceUserPath}')


StatementMeta(, 638abc08-0ae1-4880-a164-313084bf003b, 32, Finished, Available, Finished)

Optimized abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimWorkspaceUsers
