## Pull in UDFs

In [1]:
%run nb_udfs


StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 13, Finished, Available, Finished)

## Run Datasets

In [2]:
# Target workspace and lakehouse for everything this notebook writes.
# The workspace name is URL-escaped: the & symbol and spaces have to be encoded.
workspace = "Admin%20Center"
lakehouse = "lh_monitoring"

# Lakehouse table names handed to udf_GetFilePath by the cells and load helpers below.
dataset_table = "dimSemanticModels"
refreshHist_table = "factRefreshHistory"
refreshSched_table = "factRefreshSchedule"
source_table = "factDatasetSources"

# User name reported by the notebook runtime for the current session.
current_user = mssparkutils.env.getUserName()

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 14, Finished, Available, Finished)

### Dataset UDFs

In [3]:
def get_data_sources(row):
    """Flatten one semantic model's TMSL into a Spark DataFrame of partition sources.

    Args:
        row: Mapping-like record providing ``workspaceId`` and ``id`` (the dataset id).

    Returns:
        A Spark DataFrame with one row per partition found under
        ``model.tables[].partitions[]``. Each row carries the model-level
        timestamps, the table and partition attributes, the partition source
        type/expression, and a ``compositeKey`` column built by concatenating
        ``dataset_id`` and ``partition_name``. The DataFrame is empty when the
        model exposes no partitions.

    Raises:
        Exception: Whatever ``fabric.get_tmsl`` raised; it is printed and re-raised.
    """
    workspaceId = row['workspaceId']
    datasetId = row['id']
    try:
        response = fabric.get_tmsl(datasetId, workspaceId)
    except Exception as e:
        print(f'Failed api call: {e}')
        raise
    tmsl_data = json.loads(response)

    # Extract top-level metadata. Fall back to the requested dataset id when the
    # TMSL carries no "id"; otherwise concat() below would yield a null key.
    # NOTE(review): assumes the TMSL "id", when present, identifies the same dataset - confirm.
    dataset_id = tmsl_data.get("id") or datasetId
    created = tmsl_data.get("createdTimestamp")
    last_update = tmsl_data.get("lastUpdate")
    last_schema_update = tmsl_data.get("lastSchemaUpdate")

    # `or {}` / `or []` also cover keys that are present but explicitly null.
    model = tmsl_data.get("model") or {}

    # Prepare a list of rows: one flattened tuple per (table, partition).
    rows = []
    for table in model.get("tables") or []:
        table_name = table.get("name")
        is_hidden = table.get("isHidden")
        table_modified = table.get("modifiedTime")
        table_struct_modified = table.get("structureModifiedTime")

        for partition in table.get("partitions") or []:
            source = partition.get("source") or {}
            source_expression = source.get("expression")
            # TMSL can serialise a multi-line expression as a list of lines, which
            # the StringType column below would reject; collapse it to one string.
            if isinstance(source_expression, list):
                source_expression = "\n".join(str(line) for line in source_expression)

            rows.append((
                dataset_id,
                created,
                last_update,
                last_schema_update,
                table_name,
                is_hidden,
                table_modified,
                table_struct_modified,
                partition.get("name"),
                partition.get("mode"),
                partition.get("state"),
                partition.get("modifiedTime"),
                source.get("type"),
                source_expression
            ))

    # Define schema (explicit so an empty `rows` list still produces a typed DataFrame)
    schema = StructType([
        StructField("dataset_id", StringType(), True),
        StructField("createdTimestamp", StringType(), True),
        StructField("lastUpdate", StringType(), True),
        StructField("lastSchemaUpdate", StringType(), True),
        StructField("table_name", StringType(), True),
        StructField("table_isHidden", BooleanType(), True),
        StructField("table_modifiedTime", StringType(), True),
        StructField("table_structureModifiedTime", StringType(), True),
        StructField("partition_name", StringType(), True),
        StructField("partition_mode", StringType(), True),
        StructField("partition_state", StringType(), True),
        StructField("partition_modifiedTime", StringType(), True),
        StructField("source_type", StringType(), True),
        StructField("source_expression", StringType(), True)
    ])

    # Convert to DataFrame
    df_sources = spark.createDataFrame(rows, schema)
    # The schema names the column "dataset_id"; the former "datasetId" reference
    # did not resolve and made this function fail on every call.
    # NOTE(review): partition names may repeat across tables of one model; if so,
    # table_name should be part of this key - confirm before enabling the load.
    df_sources = df_sources.withColumn(
        "compositeKey",
        concat(df_sources["dataset_id"], df_sources["partition_name"])
    )

    return df_sources



StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 15, Finished, Available, Finished)

In [4]:
def get_refresh_history(row):
    """Fetch the refresh history of one dataset as a Spark DataFrame.

    Args:
        row: Mapping-like record providing ``workspaceId``, ``workspaceName``
            and ``id`` (the dataset id).

    Returns:
        A Spark DataFrame built from the ``value`` list of the refreshes
        response, without the ``refreshAttempts`` column and with a
        ``datasetId`` column added; ``None`` when ``value`` is missing, is not
        a list, or is empty.

    Raises:
        Exception: Whatever ``_base_api`` raised; it is printed and re-raised.
    """
    ws_id, _ws_name, ds_id = row['workspaceId'], row['workspaceName'], row['id']

    try:
        response = _base_api(
            request=f"/v1.0/myorg/groups/{ws_id}/datasets/{ds_id}/refreshes",
            method="get",
        )
    except Exception as err:
        print(f'Failed api call: {err}')
        raise err

    payload = response.json()
    has_history = (
        'value' in payload
        and isinstance(payload['value'], list)
        and len(payload['value']) > 0
    )
    if not has_history:
        return None  # No refresh history

    # Flatten the JSON records with pandas first, then hand them to Spark.
    history_pd = pd.json_normalize(payload['value'])
    history_df = spark.createDataFrame(history_pd).drop("refreshAttempts")
    return history_df.withColumn("datasetId", lit(ds_id))

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 16, Finished, Available, Finished)

In [5]:
def get_refresh_schedule(row):
    """Fetch the refresh schedule of one dataset as a Spark DataFrame.

    Args:
        row: Mapping-like record providing ``workspaceId``, ``workspaceName``
            and ``id`` (the dataset id).

    Returns:
        A Spark DataFrame with one row per (day, time) combination of the
        schedule, carrying ``enabled``, ``timeZone``, ``notifyOption``,
        ``datasetId`` and a ``compositeKey`` (datasetId + day + time);
        ``None`` when the response has no non-empty ``days`` and ``times``.

    Raises:
        Exception: Whatever ``_base_api`` raised; it is printed and re-raised.
    """
    ws_id, _ws_name, ds_id = row['workspaceId'], row['workspaceName'], row['id']

    try:
        response = _base_api(
            request=f"/v1.0/myorg/groups/{ws_id}/datasets/{ds_id}/refreshSchedule",
            method="get",
        )
    except Exception as err:
        print(f'Failed api call: {err}')
        raise err

    payload = response.json()

    scheduled = (
        "days" in payload
        and "times" in payload
        and payload["days"]
        and payload["times"]
    )
    if not scheduled:
        return None  # No refresh schedule

    # Cross every scheduled day with every scheduled time and attach the
    # schedule-level metadata to each combination.
    schedule_records = [
        {
            "day": day,
            "time": slot,
            "enabled": payload["enabled"],
            "timeZone": payload["localTimeZoneId"],
            "notifyOption": payload["notifyOption"],
        }
        for day in payload["days"]
        for slot in payload["times"]
    ]

    schedule_df = spark.createDataFrame(pd.DataFrame(schedule_records))
    schedule_df = schedule_df.withColumn("datasetId", lit(ds_id))
    composite_key = concat(schedule_df["datasetId"], schedule_df["day"], schedule_df["time"])
    return schedule_df.withColumn("compositeKey", composite_key)

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 17, Finished, Available, Finished)

In [6]:
def process_dataset(row):
    """Collect the refresh history and refresh schedule for a single dataset.

    Args:
        row: Mapping-like record providing ``workspaceId``, ``workspaceName``
            and ``id`` (the dataset id).

    Returns:
        dict with keys ``datasetId``, ``history`` and ``schedule``; the latter
        two hold whatever ``get_refresh_history`` / ``get_refresh_schedule``
        returned (a Spark DataFrame or ``None``).

    Raises:
        Exception: Wraps any failure of the two lookups in a message naming
            the dataset and workspace.
    """
    workspaceId = row['workspaceId']
    workspaceName = row['workspaceName']
    datasetId = row['id']

    try:
        # History first, then schedule; data-source collection is currently disabled.
        history_result = get_refresh_history(row)
        schedule_result = get_refresh_schedule(row)
        #sources_result = get_data_sources(row)
    except Exception as e:
        raise Exception(f"Failed on dataset {datasetId} in workspace {workspaceName}: {e}")

    outcome = {"datasetId": datasetId}
    outcome["history"] = history_result
    outcome["schedule"] = schedule_result
    #outcome["sources"] = sources_result
    return outcome

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 18, Finished, Available, Finished)

In [7]:
def union_hist():
    """Union the collected refresh-history DataFrames into the global ``df_refresh_hist``.

    Reads the module-level ``df_refresh_hist_list``; when it is empty only a
    warning is printed and ``df_refresh_hist`` is left untouched.
    """
    global df_refresh_hist
    if not df_refresh_hist_list:
        print("⚠️ No refresh history found.")
        return
    # Per the original author, union_batches is significantly faster here than
    # reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), ...).
    df_refresh_hist = union_batches(df_refresh_hist_list, batch_size=50)
    print("✅ Refresh History dataframe ready")

def union_schedule():
    """Union the collected refresh-schedule DataFrames into the global ``df_refresh_schedule``.

    Reads the module-level ``df_refresh_schedule_list``; when it is empty only
    a warning is printed and ``df_refresh_schedule`` is left untouched.
    """
    global df_refresh_schedule
    if not df_refresh_schedule_list:
        print("⚠️ No refresh schedule found.")
        return
    df_refresh_schedule = union_batches(df_refresh_schedule_list, batch_size=50)
    print("✅ Refresh Schedule dataframe ready")

def union_sources():
    """Union the collected data-source DataFrames into the global ``df_sources``.

    Reads the module-level ``df_sources_list``; when it is empty only a
    warning is printed and ``df_sources`` is left untouched.
    """
    global df_sources
    if not df_sources_list:
        print("⚠️ No sources found.")
        return
    df_sources = union_batches(df_sources_list, batch_size=50)
    print("✅ Data Sources dataframe ready")


StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 19, Finished, Available, Finished)

In [8]:
def load_history():
    """Upsert the global ``df_refresh_hist`` into the refresh-history table and print the result.

    The target path is resolved from ``workspace``, ``lakehouse`` and
    ``refreshHist_table``; rows are matched on the ``id`` natural key.
    """
    target_path = udf_GetFilePath(workspace, lakehouse, refreshHist_table)
    # Positional arguments after the dataframe: 1, target path, natural key columns,
    # primary key column name, False.
    # NOTE(review): the meaning of `1` and `False` is defined in nb_udfs - confirm there.
    upsert_summary = udf_UpsertDimension(
        df_refresh_hist,
        1,
        target_path,
        ['id'],
        "tableId",
        False,
    )
    print(upsert_summary)

def load_schedule():
    """Upsert the global ``df_refresh_schedule`` into the refresh-schedule table and print the result.

    The target path is resolved from ``workspace``, ``lakehouse`` and
    ``refreshSched_table``; rows are matched on the ``compositeKey`` natural key.
    """
    target_path = udf_GetFilePath(workspace, lakehouse, refreshSched_table)
    # Positional arguments after the dataframe: 1, target path, natural key columns,
    # primary key column name, False.
    # NOTE(review): the meaning of `1` and `False` is defined in nb_udfs - confirm there.
    upsert_summary = udf_UpsertDimension(
        df_refresh_schedule,
        1,
        target_path,
        ['compositeKey'],
        "tableId",
        False,
    )
    print(upsert_summary)

def load_sources():
    """Upsert the global ``df_sources`` into the dataset-sources table and print the result.

    The target path is resolved from ``workspace``, ``lakehouse`` and
    ``source_table``; rows are matched on the ``compositeKey`` natural key.
    """
    target_path = udf_GetFilePath(workspace, lakehouse, source_table)
    # Positional arguments after the dataframe: 1, target path, natural key columns,
    # primary key column name, False.
    # NOTE(review): the meaning of `1` and `False` is defined in nb_udfs - confirm there.
    upsert_summary = udf_UpsertDimension(
        df_sources,
        1,
        target_path,
        ['compositeKey'],
        "tableId",
        False,
    )
    print(upsert_summary)

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 20, Finished, Available, Finished)

### Load Dataset Tables

In [9]:
# Get a list of all workspaces from the admin endpoint and hold it as a Spark DataFrame.
# NOTE(review): only the 'workspaces' array of a single response is read here;
# confirm whether fab_client.get follows pagination for large tenants.
response = fab_client.get("/v1/admin/workspaces")
workspaces_pd = pd.json_normalize(response.json()['workspaces'])
df_workspaces = spark.createDataFrame(workspaces_pd)

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 21, Finished, Available, Finished)

In [10]:
# Get a list of all datasets; the result is loaded to a lakehouse table by the next cell.
response = _base_api(request="/v1.0/myorg/admin/datasets", method="get")
datasets_pd = pd.json_normalize(response.json()['value'])

# Columns removed before the datasets are persisted.
dropped_dataset_columns = [
    "queryScaleOutSettings.autoSyncReadOnlyReplicas",
    "queryScaleOutSettings.maxReadOnlyReplicas",
    "upstreamDatasets",
    "users",
]
df_datasets = spark.createDataFrame(datasets_pd).drop(*dropped_dataset_columns)
#df_datasets.printSchema()
#print(df_datasets.columns)

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 22, Finished, Available, Finished)

In [11]:
# Maintains the semantic-model table as a slowly changing dimension so that
# deleted datasets stay visible over time.
datasetPath = udf_GetFilePath(workspace, lakehouse, dataset_table)
print(datasetPath)

naturalKeyColumnList = ['id']
primaryKeyColumnName = "tableId"
# NOTE(review): `2` and `True` differ from the `1` / `False` used by the fact loads;
# their exact meaning is defined in nb_udfs - confirm there.
returnVal = udf_UpsertDimension(
    df_datasets,
    2,
    datasetPath,
    naturalKeyColumnList,
    primaryKeyColumnName,
    True,
)
print(returnVal)

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 23, Finished, Available, Finished)

abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimSemanticModels
✅ Upsert complete
{'startTime': '2025-04-29 20:06:15.917552', 'stopTime': '2025-04-29 20:06:45.042898', 'details': '0 records updated, 1 records inserted from 7760 staging rows to abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/dimSemanticModels'}


In [12]:
# Build the list of datasets we want refresh history and schedules for.
# Only active workspaces of type "Workspace" are kept: per the original note,
# personal workspaces and the Admin Monitoring workspace are not needed.
is_standard_workspace = df_workspaces["type"] == "Workspace"
is_active_workspace = df_workspaces["state"] == "Active"
df_np_workspaces = (
    df_workspaces
    .filter(is_standard_workspace)
    .filter(is_active_workspace)
    .withColumnRenamed("id", "workspaceOgId")
    .withColumnRenamed("name", "workspaceName")
)

# Restrict to refreshable datasets, then attach each one to its workspace.
df_datasets_refreshable = df_datasets.filter(df_datasets["isRefreshable"] == 1)
same_workspace = df_np_workspaces["workspaceOgId"] == df_datasets_refreshable["workspaceId"]
df_np_datasets = (
    df_np_workspaces
    .join(df_datasets_refreshable, same_workspace, how="inner")
    .select("workspaceId", "workspaceName", "id")
)
print(f'datasets: {df_np_datasets.count()}')
#df_dist_workspaces.show()
#df_dist_workspaces.count()

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 24, Finished, Available, Finished)

datasets: 3515


In [13]:
#### loop over datasets and get the refresh histories and schedules #### 
# not using the admin capacity rest api because we want to grab all refresh histories that would impact the gateways for proper load balancing. 

# use this for only grabbing the refresh history for one workspace
# workspaceId = 'b0e6b127-399e-40c0-8a07-6b50fce502ff'
# df_np_datasets = df_np_datasets.filter(df_np_datasets["workspaceId"] == workspaceId)


# Bring the dataset rows to the driver so they can be handed to worker threads.
rows = df_np_datasets.collect()

# Accumulators for the per-dataset Spark DataFrames that are unioned later.
df_refresh_hist_list, df_refresh_schedule_list, df_sources_list = [], [], []
combined_results = []

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 25, Finished, Available, Finished)

In [14]:
# Run the per-dataset API calls in parallel on a thread pool.
# Going from 10 to 30 max_workers only made the run about 10 seconds faster.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Each dataset row is submitted to process_dataset; the mapping lets a
    # failed Future be traced back to the row that produced it.
    future_to_row = {
        executor.submit(process_dataset, row): row for row in rows
    }

    # Futures are consumed in completion order, not submission order.
    completed = 0
    for future in as_completed(future_to_row):
        completed += 1
        try:
            outcome = future.result()
            # Keep only the non-null DataFrames (sources collection is disabled).
            for key, bucket in (
                ("history", df_refresh_hist_list),
                ("schedule", df_refresh_schedule_list),
            ):
                if outcome[key] is not None:
                    bucket.append(outcome[key])
        except Exception as e:
            # Report the failure against its originating row and keep going.
            failed_row = future_to_row[future]
            print(f"❌ Failed for dataset {failed_row['id']} in workspace {failed_row['workspaceName']}: {e}")
        # Progress marker every 500 datasets.
        if completed % 500 == 0:
            print(f"✅ Processed {completed} datasets...")

# All threads have finished once the pool context has exited.
print('✅ Looping complete!')

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 26, Finished, Available, Finished)

✅ Processed 500 datasets...
✅ Processed 1000 datasets...
✅ Processed 1500 datasets...
✅ Processed 2000 datasets...
✅ Processed 2500 datasets...
✅ Processed 3000 datasets...
✅ Processed 3500 datasets...
✅ Looping complete!


In [15]:

# Union all non-empty DataFrames.
# Reset the targets first so a dataframe left over from an earlier run of this
# cell is never mistaken for the result of the current one.
df_refresh_hist = None
df_refresh_schedule = None
#df_sources = None

# Run both unions in parallel.
# Executor.submit() stores any exception on the returned Future instead of
# raising it, so the futures are kept and result() is called on each one;
# otherwise a failed union would pass silently and leave its dataframe as None.
with ThreadPoolExecutor(max_workers=2) as executor:
    union_futures = [
        executor.submit(union_hist),
        executor.submit(union_schedule),
        #executor.submit(union_sources),
    ]
    for union_future in union_futures:
        union_future.result()

#df_refresh_hist.show(5, truncate=False)
#df_refresh_schedule.show(5, truncate=False)



StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 27, Finished, Available, Finished)

✅ Refresh History dataframe ready
✅ Refresh Schedule dataframe ready


In [16]:
# Run both loads into the lakehouse in parallel.
# Executor.submit() stores any exception on the returned Future instead of
# raising it, so the futures are kept and result() is called on each one;
# otherwise a failed upsert would leave this cell looking successful.
with ThreadPoolExecutor(max_workers=2) as executor:
    load_futures = [
        executor.submit(load_history),
        executor.submit(load_schedule),
        #executor.submit(load_sources),
    ]
    for load_future in load_futures:
        load_future.result()

StatementMeta(, e2148dbf-e2bf-407c-a7cb-a3ea8e7dc983, 28, Finished, Available, Finished)

📦 Initial load complete
{'startTime': '2025-04-29 20:11:45.593370', 'stopTime': '2025-04-29 20:13:32.760945', 'details': '0 records updated, 11336 records inserted from 11336 staging rows to abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/factRefreshHistory'}
📦 Initial load complete
{'startTime': '2025-04-29 20:11:45.409592', 'stopTime': '2025-04-29 20:16:06.129288', 'details': '0 records updated, 18392 records inserted from 18392 staging rows to abfss://b0e6b127-399e-40c0-8a07-6b50fce502ff@onelake.dfs.fabric.microsoft.com/e055fa9a-d676-4689-b18e-be950d62d16f/Tables/factRefreshSchedule'}
