### Purpose
##### Goal: Load workspaces, datasets, reports, and activity events for the entire tenant
##### Notes:
- Only works if the owner/runner of the notebook is a tenant admin
- Will automatically assign the runner of the notebook to all workspaces in order to get the refresh information (no admin api available to run this)
##### Helpful Resources:
- https://learn.microsoft.com/en-us/python/api/semantic-link-sempy/sempy.fabric?view=semantic-link-python
- https://community.fabric.microsoft.com/t5/Data-Engineering-Community-Blog/Spark-Connector-for-Fabric-Warehouse-Unified-Analytics/ba-p/4611309
- https://stackoverflow.com/questions/71001110/power-bi-rest-api-requests-not-authorizing-as-expected
- To use KeyVault, refer to this article: https://darren.gosbell.com/2023/06/calling-a-power-bi-rest-api-from-a-fabric-notebook/

In [1]:
# Imports
import notebookutils as nb
from ast import literal_eval
import sempy.fabric as fabric
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime as dt
from builtins import filter as stdFilter
from notebookutils.mssparkutils.handlers.notebookHandler import RunMultipleFailedException
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import json, requests, pandas as pd 
from requests.exceptions import HTTPError
fab_client = fabric.FabricRestClient()

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 3, Finished, Available, Finished)

In [2]:
# Variables
workspace = 'Fabric of Middle-Earth' #have to escape the & symbol using %26% 
lakehouse = 'lh_the_hoard'


ws_table = 'dimWorkspaces'
workspace_users_table = 'dimWorkspaceUsers'
tenant_settings = 'dimTenantSettings'
capacities_table = 'dimCapacities'
dashboard_table = 'dimDashboards'
dataflow_table = 'dimDataflows'
report_table = 'dimReports'

dataset_table = 'dimSemanticModels'
refreshHist_table = 'factRefreshHistory'
refreshSched_table = 'factRefreshSchedule'

act_table = 'factActivities'
#defaults the activity data to grab yesterday's data. The API only goes back 28 days maximum.
date_offset = 1
current_user = mssparkutils.env.getUserName()


StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 4, Finished, Available, Finished)

## Tasks to Run in Parallel

In [3]:
#path is the name of the notebook to trigger
#args is where we pass parameters from this notebook into the sub notebooks

nb_to_run = [
    {"path": "nb_run_workspaces", "args":{"workspace":workspace,"lakehouse":lakehouse,"ws_table":ws_table,"workspace_users_table":workspace_users_table}} ,
    {"path": "nb_run_tenant_settings", "args":{"workspace":workspace,"lakehouse":lakehouse,"tenant_settings":tenant_settings}} ,
    {"path": "nb_run_capacities", "args":{"workspace":workspace,"lakehouse":lakehouse,"capacities_table":capacities_table}} ,
    {"path": "nb_run_dashboards", "args":{"workspace":workspace,"lakehouse":lakehouse,"dashboard_table":dashboard_table}} ,
    {"path": "nb_run_dataflows", "args":{"workspace":workspace,"lakehouse":lakehouse,"dataflow_table":dataflow_table}} ,
    {"path": "nb_run_reports", "args":{"workspace":workspace,"lakehouse":lakehouse,"report_table":report_table}} ,
    {"path": "nb_run_datasets", "args":{"workspace":workspace,"lakehouse":lakehouse,"dataset_table":dataset_table,"refreshHist_table":refreshHist_table,"refreshSched_table":refreshSched_table}} 
    ,{"path": "nb_run_activities", "args":{"workspace":workspace,"lakehouse":lakehouse,"act_table":act_table,"date_offset":date_offset}} 
    ]

# Use this to backload activity data if needed 
#nb_to_run = [{"path": "nb_run_activities", "args":{"workspace":workspace,"lakehouse":lakehouse,"act_table":act_table,"date_offset" = date_offset}}]

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 5, Finished, Available, Finished)

## Adding admin access for the current user. 
- NOTE: if there are more than 200 workspaces you need access to, we can only grant access to 200 workspaces an hour so this will take a while since I've built in a wait function

In [4]:
#get a list of all workspaces and load to a table in the lakehouse
response = fab_client.get(f"/v1/admin/workspaces")
df_workspaces = pd.json_normalize(response.json()['workspaces'])
#df_workspaces
df_workspaces = spark.createDataFrame(df_workspaces)

#creates a list of workspaces we want access to for dataset refresh, history, and workspace users
df_np_workspaces = df_workspaces \
    .filter(df_workspaces["type"] != "Personal")  \
    .filter(df_workspaces["type"] != "AdminWorkspace")


#df_np_workspaces.show()


StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 6, Finished, Available, Finished)

In [5]:
def _base_api(request, method="get", payload=None, headers=None):
    base_url = "https://api.powerbi.com"
    url = base_url + request

    token = mssparkutils.credentials.getToken("https://analysis.windows.net/powerbi/api")

    if headers is None:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}"
        }

    if method.lower() == "get":
        response = requests.get(url, headers=headers)
    elif method.lower() == "post":
        response = requests.post(url, json=payload, headers=headers)
    else:
        raise ValueError(f"Unsupported method: {method}")

    response.raise_for_status()
    return response

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 7, Finished, Available, Finished)

In [6]:
rows_workspaces = df_np_workspaces.collect()
admin_api_limit = 200
admin_calls_made = 0
granted_workspaces = []
skipped_workspaces = []
payload = {
            "emailAddress": current_user,
            "groupUserAccessRight": "Admin",
            "principalType": "User",
            "identifier": current_user,
        }

def user_already_has_access(workspaceId):
    try:
        response = _base_api(
            request=f"/v1.0/myorg/groups/{workspaceId}/users",
            method="get"
        )
        users = response.json().get("value", [])
        return any(user.get("identifier", "").lower() == current_user.lower() for user in users)
    except HTTPError as e:
        if e.response.status_code == 429:
            retry_after = int(e.response.headers.get("Retry-After", 3600))
            print(f"⛔️ Hit 429 rate limit for CHECKING access. Sleeping for {retry_after} seconds...")
            time.sleep(retry_after)
            # Retry once after sleeping
            response = _base_api(
            request=f"/v1.0/myorg/groups/{workspaceId}/users",
            method="get"
            )

def process_workspace(row):
    global admin_calls_made
    workspaceId = row["id"]
    workspaceName = row["name"]

    try:
        if user_already_has_access(workspaceId):
            print(f"✅ Already has access to {workspaceName}")
            skipped_workspaces.append(workspaceName)
            return

        if admin_calls_made >= admin_api_limit:
            print("⏳ Reached admin API limit. Sleeping for 1 hour...")
            time.sleep(3600)
            admin_calls_made = 0

        try:
            _base_api(
                request=f"/v1.0/myorg/admin/groups/{workspaceId}/users",
                method="post",
                payload=payload
            )
            admin_calls_made += 1
            granted_workspaces.append(workspaceName)
            print(f"✅ Access granted to {workspaceName}")

        except HTTPError as e:
            if e.response.status_code == 429:
                retry_after = int(e.response.headers.get("Retry-After", 3600))
                print(f"⛔️ Hit 429 rate limit for GRANTING access. Sleeping for {retry_after} seconds...")
                time.sleep(retry_after)
                # Retry once after sleeping
                _base_api(
                    request=f"/v1.0/myorg/admin/groups/{workspaceId}/users",
                    method="post",
                    payload=payload
                )
                admin_calls_made += 1
                granted_workspaces.append(workspaceName)
                print(f"✅ Access granted to {workspaceName} after retry")
            else:
                raise e  # Re-raise for anything else

    except Exception as e:
        print(f"❌ Failed for {workspaceName}: {e}")

# Run with threads (adjust max_workers if needed)
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_workspace, row): row for row in rows_workspaces}
    for future in as_completed(futures):
        pass  # Output is handled in the function

print("✅ Done granting access!")
print(f"🔓 Granted access to {len(granted_workspaces)} workspaces.")
print(f"⏭️ Skipped {len(skipped_workspaces)} workspaces (already had access).")

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 8, Finished, Available, Finished)

✅ Already has access to DS.Adworks_L1
✅ Already has access to DEV,Adworks_L1
✅ Already has access to DEV.SALES KCON3_L3
✅ Already has access to QA.Sales_L1
✅ Already has access to Test Workspace
✅ Already has access to Happy Coding
✅ Already has access to DS.Adworks.L3
✅ Already has access to PW MSC Test on Data On Wheels
✅ Already has access to Happy Coding QA
✅ Already has access to Happy Coding Dev
✅ Already has access to Dragon Den Sandbox
✅ Already has access to Thr3e
✅ Already has access to Fabric of Middle-Earth
✅ Already has access to Microsoft Fabric Capacity Metrics 5/10/2025 4:27:20 PM
✅ Already has access to Analytics Sandbox
✅ Already has access to Serengeti End to End Workshop
✅ Already has access to Microsoft Fabric Capacity Metrics 8/8/2024, 4:20:18 PM
✅ Already has access to add_card_123_happy_coding_dev_kf
✅ Already has access to Black
✅ Already has access to Showdown
✅ Already has access to PASS Summit 2024
✅ Already has access to No Capacity Testing
✅ Already has ac

## Run Tasks
##### Using run multiple allows us to run the following notebooks in parallel and within the same spark session (makes it run much faster and uses less capacity)

If you need to run just one, comment out the other notebooks. 

In [7]:
# Build DAG String from metadata
df_nb_to_run = spark.createDataFrame(nb_to_run)

activities = []
[
    activities.append(
        {
            'name': m.path,
            'path': m.path,
            'timeoutPerCellInSeconds': 60000,
        }
    )
    for m in df_nb_to_run.collect()
]


DAG = {
    'activities': activities,
    'timeoutInSeconds': 60000, # Number of seconds allowed for all activities to complete before shutting down
    'concurrency': 20 # Max number of notebooks to run concurrently
}
print(DAG)

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 9, Finished, Available, Finished)

{'activities': [{'name': 'nb_run_workspaces', 'path': 'nb_run_workspaces', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_tenant_settings', 'path': 'nb_run_tenant_settings', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_capacities', 'path': 'nb_run_capacities', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_dashboards', 'path': 'nb_run_dashboards', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_dataflows', 'path': 'nb_run_dataflows', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_reports', 'path': 'nb_run_reports', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_datasets', 'path': 'nb_run_datasets', 'timeoutPerCellInSeconds': 60000}, {'name': 'nb_run_activities', 'path': 'nb_run_activities', 'timeoutPerCellInSeconds': 60000}], 'timeoutInSeconds': 60000, 'concurrency': 20}


In [8]:
try:
    output = notebookutils.notebook.runMultiple( DAG, {'displayDAGViaGraphviz': False} )
except RunMultipleFailedException as e:
    output = e.result


StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 10, Finished, Available, Finished)

In [9]:
%run nb_udfs

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 21, Finished, Available, Finished)

In [10]:
#syncing the sql endpoint to ensure that our reporting can get the latest version of the data

udf_SyncSqlEndpoint(workspace,lakehouse)

StatementMeta(, 7dc5695b-0d36-4bb4-87c5-f998094d9cfe, 22, Finished, Available, Finished)



