In [None]:
# Libraries # 
import json, requests, pandas as pd 
from datetime import datetime as dt, timedelta
from pyspark.sql import SparkSession, Row, Window, functions as funcast
from pyspark.sql.functions import *
from pyspark.sql.types import *
from requests.exceptions import HTTPError
from dateutil.parser import parse as dtparser
from pyspark.sql.types import StringType

import sempy.fabric as fabric 
from notebookutils import notebook  
from notebookutils.mssparkutils.handlers.notebookHandler import RunMultipleFailedException
import notebookutils.mssparkutils as mssparkutils

from delta import *
import ast
import notebookutils
from functools import reduce
from random import randint
from time import sleep
from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# sempy_labs is intentionally not imported: %pip install cannot be run when this notebook is invoked in a "run multiple notebooks" scenario.
# The sempy_labs calls that are needed are re-created as UDFs further down instead.

#try:
#    import sempy_labs as labs
#except:
#    %pip install semantic-link-labs
#    import sempy_labs as labs
#%pip install semantic-link-labs

#import sempy_labs as labs
#
#from sempy_labs import migration, directlake, admin, graph, lakehouse as lake, report as rep
#from sempy_labs.admin import _activities
#from sempy_labs.tom import connect_semantic_model
#from sempy_labs.report import ReportWrapper
#import sempy_labs._icons as icons
#from sempy_labs._helper_functions import (
#    resolve_workspace_name_and_id,
#    resolve_capacity_id,
#    _base_api,
#    _create_dataframe
#)
#from sempy_labs.admin import (_activities, list_activity_events, _tenant)
#from sempy_labs._refresh_semantic_model import (get_semantic_model_refresh_history)



# Configuration
# Session-level Spark settings: the Parquet INT96 rebase mode (write and read)
# and the Parquet datetime rebase mode (read) are all set to LEGACY.
# NOTE(review): `spark` is not defined in this file; presumably it is the
# SparkSession injected by the notebook runtime - confirm.
# NOTE(review): datetimeRebaseModeInWrite is not set here - confirm that is intentional.
spark.conf.set('spark.sql.parquet.int96RebaseModeInWrite','LEGACY')
spark.conf.set('spark.sql.parquet.int96RebaseModeInRead','LEGACY')
spark.conf.set('spark.sql.parquet.datetimeRebaseModeInRead','LEGACY')


# REST clients created once at module level so later cells can reuse them.
fab_client = fabric.FabricRestClient()
pbi_client = fabric.PowerBIRestClient()




#### Sempy Library UDFs (no pip install needed)

In [None]:
# sempy library UDFs


def generate_hourly_windows(date_str):
    """
    Split one calendar day into 24 consecutive one-hour windows.

    Parameters:
        date_str (str): Day to split, formatted as ``YYYY-MM-DD``.

    Returns:
        list[tuple[str, str]]: ``(start, end)`` timestamp strings per hour.
        Starts end in ``.000Z`` and ends in ``.999Z``; the ``Z`` suffix is
        appended literally, no timezone conversion is performed.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d``.
    """
    midnight = dt.strptime(date_str, "%Y-%m-%d")
    one_hour = timedelta(hours=1)
    one_millisecond = timedelta(milliseconds=1)

    def _window(hour):
        opens = midnight + hour * one_hour
        # The window closes one millisecond before the next hour begins.
        closes = opens + one_hour - one_millisecond
        return (
            opens.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
            closes.strftime("%Y-%m-%dT%H:%M:%S.999Z"),
        )

    return [_window(hour) for hour in range(24)]


def resolve_workspace_name_and_id(workspace_name, df_workspaces):
    """
    Look a workspace up by name in a DataFrame of known workspaces.

    Parameters:
        workspace_name (str): Name to match against the ``name`` column.
        df_workspaces: DataFrame exposing ``name`` and ``id`` columns and
            supporting ``filter(...).first()``.

    Returns:
        tuple: ``(workspaceName, workspaceId)`` taken from the first match.

    Raises:
        ValueError: If no row matches ``workspace_name``.
    """
    name_matches = df_workspaces["name"] == workspace_name
    match = df_workspaces.filter(name_matches).first()

    if not match:
        raise ValueError(f"Workspace '{workspace_name}' not found.")

    return match["name"], match["id"]

def pagination(response, headers):
    """
    Collect the JSON body of ``response`` plus the body of every follow-up page.

    Two paging conventions are followed: the OData ``@odata.nextLink`` field
    and the ``continuationUri`` field (used, for example, by the Power BI admin
    activity events endpoint). Previously only ``@odata.nextLink`` was
    followed, so ``continuationUri``-paged results stopped after page one.

    Parameters:
        response: Response object of the first request; must expose ``json()``,
            ``url``, ``status_code`` and ``text``.
        headers (dict): Headers (including authorization) reused for the
            follow-up GET requests.

    Returns:
        list: One parsed JSON body per page, in the order they were fetched.
        If a follow-up page cannot be parsed, the pages collected so far are
        returned (best effort).

    Raises:
        Exception: Whatever ``response.json()`` raises for the first page.
    """

    def _next_page_url(page):
        # Only JSON objects can carry a continuation link.
        if not isinstance(page, dict):
            return None
        return page.get('@odata.nextLink') or page.get('continuationUri')

    try:
        page = response.json()
    except Exception:
        print(f"⚠️ Failed to parse JSON for URL: {response.url}")
        print(f"🔍 Status code: {response.status_code}")
        print(f"📄 Raw response text: {response.text[:300]}")  # limit output
        raise

    pages = [page]
    visited = set()
    next_url = _next_page_url(page)

    # Stop when there is no link left, or the service hands back a link that
    # was already followed (which would otherwise loop forever).
    while next_url and next_url not in visited:
        visited.add(next_url)
        response = requests.get(next_url, headers=headers)
        try:
            page = response.json()
        except Exception:
            print(f"⚠️ Pagination failed at {next_url}")
            print(f"🔍 Status code: {response.status_code}")
            print(f"📄 Raw response: {response.text[:300]}")
            break  # best effort: keep the pages collected so far
        pages.append(page)
        next_url = _next_page_url(page)

    return pages
'''
def pagination(response,headers):

    responses = []
    response_json = response.json()
    responses.append(response_json)

    # Check for pagination
    continuation_token = response_json.get("continuationToken")
    continuation_uri = response_json.get("continuationUri")

    # Loop to handle pagination
    while continuation_token is not None:
        response = requests.get(continuation_uri, headers=headers)
        response_json = response.json()
        responses.append(response_json)

        # Update the continuation token and URI for the next iteration
        continuation_token = response_json.get("continuationToken")
        continuation_uri = response_json.get("continuationUri")

    return responses
'''

def _base_api(request, method="get", payload=None, headers=None, uses_pagination=False):
    """
    Send a GET or POST request to the Power BI REST API host.

    Parameters:
        request (str): Path (and query string) appended to
            ``https://api.powerbi.com``.
        method (str): ``"get"`` or ``"post"``, case-insensitive.
        payload: JSON body for POST requests; not used for GET.
        headers (dict | None): Request headers. When ``None``, JSON
            content-type and bearer-token headers are built.
        uses_pagination (bool): For GET only - return the list produced by
            ``pagination`` instead of the single response.

    Returns:
        The response object, or a list of parsed pages for a paginated GET.

    Raises:
        ValueError: For any method other than GET/POST.
        requests.HTTPError: For a POST answered with an error status.
    """
    url = "https://api.powerbi.com" + request

    # The token is requested on every call, even when the caller supplies headers.
    token = mssparkutils.credentials.getToken("https://analysis.windows.net/powerbi/api")

    if headers is None:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}"
        }

    verb = method.lower()

    if verb == "get":
        response = requests.get(url, headers=headers)
        # GET responses are handed back without a status check.
        return pagination(response, headers) if uses_pagination else response

    if verb != "post":
        raise ValueError(f"Unsupported method: {method}")

    response = requests.post(url, json=payload, headers=headers)
    response.raise_for_status()
    return response


'''
def list_activity_events(start_date, end_date):
    request = f"/v1.0/myorg/admin/activityevents?startDateTime={start_date}&endDateTime={end_date}"
    response = _base_api(request, method="get")
    return response.json().get("value", [])
'''

def _create_dataframe(columns: dict) -> pd.DataFrame:
    return pd.DataFrame(columns=list(columns.keys()))

def _update_dataframe_datatypes(dataframe: pd.DataFrame, column_map: dict):
    """
    Updates the datatypes of columns in a pandas dataframe based on a column map.

    Example:
    {
        "Order": "int",
        "Public": "bool",
    }
    """

    for column, data_type in column_map.items():
        if column in dataframe.columns:
            if data_type == "int":
                dataframe[column] = dataframe[column].astype(int)
            elif data_type == "bool":
                dataframe[column] = dataframe[column].astype(bool)
            elif data_type == "float":
                dataframe[column] = dataframe[column].astype(float)
            elif data_type == "datetime":
                dataframe[column] = pd.to_datetime(dataframe[column])
            # This is for a special case in admin.list_reports where datetime itself does not work. Coerce fixes the issue.
            elif data_type == "datetime_coerce":
                dataframe[column] = pd.to_datetime(dataframe[column], errors="coerce")
            # This is for list_synonyms since the weight column is float and can have NaN values.
            elif data_type == "float_fillna":
                dataframe[column] = dataframe[column].fillna(0).astype(float)
            # This is to avoid NaN values in integer columns (for delta analyzer)
            elif data_type == "int_fillna":
                dataframe[column] = dataframe[column].fillna(0).astype(int)
            elif data_type in ["str", "string"]:
                dataframe[column] = dataframe[column].astype(str)
            else:
                raise NotImplementedError

def list_activity_events(
    start_time: str,
    end_time: str,
    activity_filter: Optional[str] = None,
    user_id_filter: Optional[str] = None,
    return_dataframe: bool = True,
) -> pd.DataFrame:
    """
    Fetch admin activity events for a time window as a pandas DataFrame.

    Parameters:
        start_time (str): Window start; parsed with ``dtparser`` and embedded
            verbatim in the request URL.
        end_time (str): Window end; must fall on the same date as ``start_time``.
        activity_filter (str, optional): Adds ``Activity eq '<value>'`` to the
            ``$filter`` query option.
        user_id_filter (str, optional): Adds ``UserId eq '<value>'`` to the
            ``$filter`` query option.
        return_dataframe (bool): Accepted but not read by this implementation;
            a DataFrame is always returned.

    Returns:
        pd.DataFrame: One row per event, with spaced column labels (the
        corresponding event field is the label with spaces removed). An empty
        frame with the same column labels is returned when the API call fails
        or yields no events.

    Raises:
        ValueError: If start and end do not fall on the same date.
    """
    same_day = dtparser(start_time).date() == dtparser(end_time).date()
    if not same_day:
        raise ValueError(f"{icons.red_dot} Start and End Times must be within the same UTC day.")

    # Output column label -> type keyword understood by _update_dataframe_datatypes.
    columns = {
        "Id": "string", "Record Type": "string", "Creation Time": "datetime",
        "Operation": "string", "Organization Id": "string", "User Type": "string",
        "User Key": "string", "Workload": "string", "Result Status": "string",
        "User Id": "string", "Client IP": "string", "User Agent": "string",
        "Activity": "string", "Workspace Name": "string", "Workspace Id": "string",
        "Object Id": "string", "Request Id": "string", "Object Type": "string",
        "Object Display Name": "string", "Experience": "string",
        "Refresh Enforcement Policy": "string", "Is Success": "bool",
        "Activity Id": "string", "Item Name": "string", "Dataset Name": "string",
        "Report Name": "string", "Capacity Id": "string", "Capacity Name": "string",
        "App Name": "string", "Dataset Id": "string", "Report Id": "string",
        "Artifact Id": "string", "Artifact Name": "string", "Report Type": "string",
        "App Report Id": "string", "Distribution Method": "string",
        "Consumption Method": "string", "Artifact Kind": "string",
    }
    empty_frame = _create_dataframe(columns=columns)

    url = f"/v1.0/myorg/admin/activityevents?startDateTime='{start_time}'&endDateTime='{end_time}'"
    filter_clauses = [
        clause
        for value, clause in (
            (activity_filter, f"Activity eq '{activity_filter}'"),
            (user_id_filter, f"UserId eq '{user_id_filter}'"),
        )
        if value
    ]
    if filter_clauses:
        url += "&$filter=" + " and ".join(filter_clauses)

    try:
        pages = _base_api(request=url, uses_pagination=True)
    except Exception as e:
        # Best effort: report the failure and fall back to the empty frame.
        print(f"⚠️ Error in API call for {start_time}–{end_time}: {e}")
        return empty_frame

    # Event payload keys are the column labels without spaces.
    field_by_label = {label: label.replace(" ", "") for label in columns}
    records = [
        {label: event.get(field) for label, field in field_by_label.items()}
        for page in pages
        for event in page.get("activityEventEntities", [])
    ]

    if not records:
        return empty_frame

    events = pd.DataFrame(records)
    _update_dataframe_datatypes(events, columns)
    return events





def get_semantic_model_refresh_history(workspace_id, dataset_id):
    """
    Return the refresh history entries of a semantic model (dataset).

    Parameters:
        workspace_id: Id of the workspace (group) that holds the dataset.
        dataset_id: Id of the dataset.

    Returns:
        list: The ``value`` array of the response body, or ``[]`` when the
        body has no ``value`` key.
    """
    refreshes_path = f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/refreshes"
    body = _base_api(refreshes_path, method="get").json()
    return body.get("value", [])


class icons:
    """Emoji status markers; ``red_dot`` prefixes the ValueError raised by list_activity_events."""
    # NOTE(review): presumably a local stand-in for sempy_labs._icons, which is
    # not imported in this notebook - confirm.
    green_dot = "🟢"
    red_dot = "🔴"
    yellow_dot = "🟡"
    check_mark = "✅"

#### Functions

In [None]:
def grant_admin_access(workspaceId, current_user):
    """
    Add a user to a workspace with the Admin access right via the admin API.

    Parameters:
        workspaceId: Id of the workspace (group) to grant access on.
        current_user: Sent as both ``emailAddress`` and ``identifier`` of the
            principal.

    Returns:
        The ``workspaceId`` that was passed in.
    """
    users_endpoint = f"/v1.0/myorg/admin/groups/{workspaceId}/users"
    admin_grant = dict(
        emailAddress=current_user,
        groupUserAccessRight="Admin",
        principalType="User",
        identifier=current_user,
    )

    # POST errors surface as exceptions from _base_api.
    _base_api(request=users_endpoint, method="post", payload=admin_grant)
    return workspaceId

In [3]:
def union_batches(df_list, batch_size=50):
    """
    Efficiently unions a large list of Spark DataFrames in batches.
    This avoids deep execution plans caused by chaining thousands of .unionByName() calls.

    Parameters:
        df_list (list): List of Spark DataFrames to union. The list itself is not modified.
        batch_size (int): Number of DataFrames to union at a time. Values below 2
            are treated as 2, because a batch of one never reduces the number of
            DataFrames (the loop would not terminate) and a batch of zero is not
            a valid step.

    Returns:
        Spark DataFrame: A single DataFrame resulting from the union of all inputs,
        or None when ``df_list`` is empty or None.
    """
    if not df_list:
        return None

    # Every pass must shrink the list, which requires at least two frames per batch.
    step = max(batch_size, 2)

    pending = list(df_list)
    while len(pending) > 1:
        # Union each slice left-to-right, then repeat over the fewer, bigger results.
        pending = [
            reduce(
                lambda left, right: left.unionByName(right, allowMissingColumns=True),
                pending[start:start + step],
            )
            for start in range(0, len(pending), step)
        ]
    return pending[0]

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 11, Finished, Available, Finished)

In [4]:
def get_workspace_items(workspace=None):
    """
    List the items of a Fabric workspace as a pandas DataFrame.

    Previously the items were only fetched when ``workspace`` was given; with
    the default ``workspace=None`` the current workspace id was resolved but
    never used and the function always returned None. The fetch now runs for
    both cases.

    Parameters:
        workspace: Workspace to list, passed to ``fabric.resolve_workspace_id``.
            When None, the id from ``fabric.get_workspace_id()`` is used.

    Returns:
        pd.DataFrame | None: The normalized ``value`` array of the response,
        or None when the workspace is not found or the request fails (the
        error is printed, not raised).
    """
    # NOTE(review): only the first response page is read; if this endpoint
    # pages its results, later pages are not fetched - confirm against the API docs.
    if workspace is None:
        # Resolved outside the try block, as before, so failures here still propagate.
        workspace_id = fabric.get_workspace_id()

    try:
        if workspace is not None:
            workspace_id = fabric.resolve_workspace_id(workspace)

        response = fab_client.get(f"/v1/workspaces/{workspace_id}/items")

        # 200 is the success status for this endpoint (201 would indicate creation).
        if response.status_code != 200:
            raise FabricHTTPException(response)

        return pd.json_normalize(response.json()['value'])
    except WorkspaceNotFoundException as e:
        print("Caught a WorkspaceNotFoundException:", e)
    except FabricHTTPException as e:
        print("Caught a FabricHTTPException. Check the API endpoint, authentication.", e)
    return None
   

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 12, Finished, Available, Finished)

In [5]:
def replace_nulls_in_array(arr, default="UNKNOWN"):
    """
    Return a new list in which every ``None`` element of ``arr`` is replaced.

    Parameters:
        arr: Iterable of values, or None.
        default: Replacement for ``None`` elements.

    Returns:
        list | None: The cleaned list; ``None`` is passed through unchanged.
    """
    if arr is None:
        return arr

    cleaned = []
    for item in arr:
        cleaned.append(default if item is None else item)
    return cleaned

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 13, Finished, Available, Finished)

In [6]:
# metadata functions

def udf_AddHashKeyColumn(df, naturalKeyColumnList):
    """
    Append an ``ETLHashKey`` column: SHA-256 over all non-key columns.

    Parameters:
        df: Source DataFrame.
        naturalKeyColumnList: Column names excluded from the hash.

    Returns:
        DataFrame: All original columns plus ``ETLHashKey``.
    """
    hash_inputs = []
    for name in df.columns:
        # Natural key columns do not take part in the hash.
        if name in naturalKeyColumnList:
            continue
        # Every value is hashed as text; nulls are represented by 'NA'.
        as_text = col(name).cast(StringType())
        hash_inputs.append(when(col(name).isNotNull(), as_text).otherwise(lit("NA")))

    # Pipe-delimited concatenation of the text values, hashed with SHA2-256.
    etl_hash_key = sha2(concat_ws("|", *hash_inputs), 256).alias("ETLHashKey")

    return df.select("*", etl_hash_key)
    
    
def udf_AddSCD2Columns(df):
    """
    Append the SCD2 tracking columns with their initial values.

    Adds ``StartDateTime`` (1900-01-01), ``EndDateTime`` (9999-12-31), both
    cast to timestamp, and ``IsCurrent`` (True) after the existing columns.
    """
    start_col = lit("1900-01-01").cast("timestamp").alias("StartDateTime")
    end_col = lit("9999-12-31").cast("timestamp").alias("EndDateTime")
    current_flag = lit(True).alias("IsCurrent")

    return df.select("*", start_col, end_col, current_flag)

def udf_AddModifiedDateColumn(df):
    """Append an ``ETLModifiedDateTime`` column holding the current timestamp."""
    modified_at = current_timestamp().alias("ETLModifiedDateTime")
    return df.select("*", modified_at)

def udf_AddPrimaryKey(df, targetPath, primaryKeyColumnName="ID"):
    """
    Prepend a sequential surrogate key column that continues after the target's max key.

    Parameters:
        df: DataFrame that receives the key column.
        targetPath: Path of the Delta table whose current maximum key is the
            starting offset; if the path does not exist the offset is 0.
        primaryKeyColumnName (str): Name of the key column (read from the
            target and added to ``df``).

    Returns:
        DataFrame: The key column followed by all original columns.
    """
    # Default the max existing ID to 0 (initial load).
    maxID = 0

    # Find the largest ID if the table exists.
    if notebookutils.fs.exists(targetPath):
        existing_max = (
            spark.read.format("delta").load(targetPath)
            .agg({primaryKeyColumnName: "max"})
            .collect()[0][0]
        )
        # max() is None when the table has no rows (or the key column is all
        # NULL); adding None to a Column would raise, so keep the default of 0.
        if existing_max is not None:
            maxID = existing_max

    # No partitioning and a constant ordering: the numbering is arbitrary but gap-free.
    sequence = row_number().over(Window.orderBy(lit(None)))

    return df.select((maxID + sequence).alias(primaryKeyColumnName), "*")
                    

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 14, Finished, Available, Finished)

In [7]:
def udf_LoadTableInitial(df, targetPath):
    """
    Write ``df`` to ``targetPath`` as a Delta table, replacing any existing data.

    The write uses overwrite mode with ``overwriteSchema`` enabled and sets the
    ``delta.columnMapping.mode`` option to ``name``.
    """
    writer = df.write.format('delta').mode('overwrite')
    writer = writer.option('delta.columnMapping.mode', 'name')
    writer = writer.option('overwriteSchema', 'True')
    writer.save(targetPath)

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 15, Finished, Available, Finished)

In [8]:
# file path function
def udf_GetFilePath(workspace, lakehouse, table):
    """
    Build the path of a lakehouse table.

    Parameters:
        workspace: Workspace name used in the ``name eq '...'`` filter.
        lakehouse: Lakehouse name looked up inside that workspace.
        table: Table name appended below ``Tables/``.

    Returns:
        str: ``<abfsPath of the lakehouse>/Tables/<table>``.
    """
    matching_workspaces = fabric.list_workspaces(f"name eq '{workspace}'")
    workspace_id = matching_workspaces['Id'][0]

    lakehouse_details = notebookutils.lakehouse.get(name=lakehouse, workspaceId=workspace_id)
    abfs_root = lakehouse_details['properties']['abfsPath']

    return f'{abfs_root}/Tables/{table}'

StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 16, Finished, Available, Finished)

In [9]:
def udf_UpsertDimension(df, dimensionType, targetPath, naturalKeyColumnList, primaryKeyColumnName="ID", flagSoftDeletes=False):
    """
    Upsert a staged DataFrame into a dimension Delta table at ``targetPath``.

    If the target path does not exist, the staged rows get a primary key and
    are written as the initial load. Otherwise rows are compared to the target
    on the natural key and on ``ETLHashKey`` and applied with one Delta merge.

    Parameters:
        df: Staged source rows.
        dimensionType: ``1`` or ``2`` are handled explicitly.
            For ``2`` the SCD2 columns are added, only target rows with
            ``IsCurrent = 1`` are compared, changed rows are expired
            (``IsCurrent`` False, ``EndDateTime`` now) and re-inserted with
            ``StartDateTime`` now.
            For ``1`` matched rows have every staged column except the primary
            key overwritten.
        targetPath: Path of the target Delta table.
        naturalKeyColumnList: Natural key column names used for joins, for the
            merge condition, and excluded from the hash.
        primaryKeyColumnName: Name of the surrogate key column.
        flagSoftDeletes: When True and the target exists, an ``IsDeleted``
            column is added (False for staged rows) and natural keys present in
            the target but absent from the staged rows are appended with
            ``IsDeleted`` True.

    Returns:
        dict: ``startTime``, ``stopTime`` (stringified datetimes) and
        ``details`` (a summary message with update/insert counts).
    """
    from datetime import datetime as dt
    startTime = dt.now()

    # Count staged rows only when there are any; an empty input leaves None,
    # which the summary message renders as "?".
    staging_rows_added = None
    if not df.rdd.isEmpty():
        staging_rows_added = df.count()

    # Defaults; overridden below for dimensionType 2.
    dfTargetWhereClause = "1=1"
    mergeCondition = "1=1 AND "
    updateSetClause = {}  # init outside if blocks

    if dimensionType == 2:
        dfTargetWhereClause = "IsCurrent = 1"
        # Only staged rows without a primary key (the expiry rows built below)
        # may match a current target row; rows with a key fall through to insert.
        mergeCondition = f"source.`{primaryKeyColumnName}` IS NULL AND target.IsCurrent = 1 AND "
        updateSetClause = {
            "EndDateTime": "source.EndDateTime",
            "IsCurrent": "source.IsCurrent",
            "ETLModifiedDateTime": "source.ETLModifiedDateTime"
        }

    # Soft deletes: flag target keys that no longer appear in the staged rows
    if flagSoftDeletes and notebookutils.fs.exists(targetPath):
        df = df.withColumn("IsDeleted", lit(False))

        dfTarget = (
            spark.read.format("delta").load(targetPath)
            .select(*naturalKeyColumnList)
            .where(dfTargetWhereClause)
        )

        # Natural keys that exist in the target but not in the staged rows.
        deletes = (
            dfTarget.alias("target")
            .join(df.alias("source"), naturalKeyColumnList, "left_anti")
            .withColumn("IsDeleted", lit(True))
        )

        # `deletes` only carries the natural keys and IsDeleted; the remaining
        # staged columns are missing from it, hence allowMissingColumns.
        df = df.unionByName(deletes, allowMissingColumns=True)

    # Add ETL columns (hash key, SCD2 columns for type 2, modified timestamp)
    df = udf_AddHashKeyColumn(df, naturalKeyColumnList)
    if dimensionType == 2:
        df = udf_AddSCD2Columns(df)
    df = udf_AddModifiedDateColumn(df)

    # Upsert logic
    if notebookutils.fs.exists(targetPath):
        targetDelta = DeltaTable.forPath(spark, targetPath)

        # Selecting only relevant columns
        dfTarget = (
            spark.read.format("delta").load(targetPath)
            .select(
                *naturalKeyColumnList,
                col("ETLHashKey").alias("targetHashKey"),
                col(primaryKeyColumnName).alias("targetID")
            )
            .where(dfTargetWhereClause)
        ).cache()

        # Use cache to prevent recomputation
        # NOTE(review): neither cached DataFrame is unpersisted in this function - confirm that is acceptable.
        df = df.cache()

        # Rows whose natural key exists in the target but whose hash differs.
        updates = (
            df.alias("source")
            .join(dfTarget.alias("target"), naturalKeyColumnList, "inner")
            .where("source.ETLHashKey != target.targetHashKey")
        )

        if dimensionType == 2:
            # Expiry rows for the current versions of the changed rows.
            expires = (
                updates
                .withColumn("IsCurrent", lit(False))
                .withColumn("EndDateTime", current_timestamp())
            )

        # Rows whose natural key has no counterpart in the target.
        inserts = (
            df.alias("source")
            .join(dfTarget.alias("target"), naturalKeyColumnList, "left")
            .where("target.targetHashKey IS NULL")
        )

        if dimensionType == 2:
            # Changed rows are inserted as new versions starting now; the
            # merge's update side then only carries the expiry rows.
            inserts = inserts.union(updates.withColumn("StartDateTime", current_timestamp()))
            updates = expires

        inserts = udf_AddPrimaryKey(inserts, targetPath, primaryKeyColumnName)

        mergeCondition += " AND ".join([f"target.`{c}` = source.`{c}`" for c in naturalKeyColumnList])

        # Counts stay None (reported as 0) when the corresponding set is empty
        insertsCount = None
        updatesCount = None
        if not inserts.rdd.isEmpty():
            insertsCount = inserts.count()
        if not updates.rdd.isEmpty():
            updatesCount = updates.count()

        # Update rows get a NULL primary key in first position so that their
        # column layout lines up with `inserts` for the positional union.
        stageChanges = (
            inserts
            .union(
                updates.select(
                    lit(None).cast(IntegerType()).alias(primaryKeyColumnName), "*"
                )
            )
            .drop("targetHashKey", "targetID")
        )

        if dimensionType == 1:
            # Type 1: overwrite every staged column except the primary key.
            updateSetClause = {
                f"`{c}`": f"source.`{c}`" for c in stageChanges.columns if c != primaryKeyColumnName
            }

        # Single merge call: matched rows are updated, unmatched rows are inserted
        (
            targetDelta.alias("target").merge(
                source=stageChanges.alias("source"),
                condition=mergeCondition
            )
            .whenMatchedUpdate(set=updateSetClause)
            .whenNotMatchedInsertAll()
            .execute()
        )

        print("✅ Upsert complete")

    else:
        # Target does not exist yet: assign keys and write everything.
        df = udf_AddPrimaryKey(df, targetPath, primaryKeyColumnName)
        udf_LoadTableInitial(df, targetPath)
        insertsCount = df.count()
        updatesCount = 0
        print("📦 Initial load complete")

    stopTime = dt.now()
    details = f'{updatesCount or 0} records updated, {insertsCount or 0} records inserted from {staging_rows_added or "?"} staging rows to {targetPath}'

    return {
        "startTime": str(startTime),
        "stopTime": str(stopTime),
        "details": details
    }





StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 17, Finished, Available, Finished)

In [10]:
# Function to sync SQL endpoint with the Lakehouse or Warehouse
def udf_SyncSqlEndpoint(workspace, lakehouse):
    """
    Trigger a metadata refresh of a lakehouse's SQL endpoint and wait for it to finish.

    Parameters:
        workspace: Workspace name used in the ``name eq '...'`` filter.
        lakehouse: Lakehouse name looked up inside that workspace.

    Returns:
        list[str]: Log messages - one line per synced table on success, or a
        message describing the failure. Errors are captured in the log rather
        than raised. (Previously a failed sync request returned None and lost
        its error message.)
    """
    logs = []
    try:
        # Resolve workspace -> lakehouse -> SQL endpoint id.
        workspaceID = fabric.list_workspaces(f"name eq '{workspace}'")['Id'][0]
        lakehouseID = notebookutils.lakehouse.get(name=lakehouse, workspaceId=workspaceID)['id']

        client = fabric.FabricRestClient()
        lakehouse_info = client.get(f"/v1/workspaces/{workspaceID}/lakehouses/{lakehouseID}").json()
        sql_endpoint_id = lakehouse_info['properties']['sqlEndpointProperties']['id']

        # Initiate the sync.
        uri = f"/v1.0/myorg/lhdatamarts/{sql_endpoint_id}"
        payload = {"commands": [{"$type": "MetadataRefreshExternalCommand"}]}

        response = client.post(uri, json=payload)
        if response.status_code != 200:
            logs.append(f"Error initiating sync: {response.status_code} - {response.text}")
            return logs

        # Start from the initial response so status_data is defined even when
        # the batch is already in a terminal state and no polling happens.
        status_data = response.json()
        batch_id = status_data["batchId"]
        progress_state = status_data["progressState"]

        status_uri = f"/v1.0/myorg/lhdatamarts/{sql_endpoint_id}/batches/{batch_id}"

        # Poll once per second until the batch leaves the 'inProgress' state.
        while progress_state == 'inProgress':
            time.sleep(1)
            status_data = client.get(status_uri).json()
            progress_state = status_data["progressState"]

        if progress_state == 'success':
            tables = status_data['operationInformation'][0]['progressDetail']['tablesSyncStatus']
            table_lines = [
                f"Table: {table['tableName']}   Last Update: {table.get('lastSuccessfulUpdate', 'N/A')}  "
                f"Table Sync State: {table['tableSyncState']}  SQL Sync State: {table['sqlSyncState']}   "
                f"Table Warnings: {table['warningMessages']}"
                for table in tables
            ]
            for line in table_lines:
                print(line)
                logs.append(line)
            # uncomment if you need to see all the details
            # display(status_data)
        elif progress_state == 'failure':
            logs.append(f"Sync failed: {status_data}")
        else:
            # Any other terminal state used to be dropped without a trace.
            logs.append(f"Sync ended in unexpected state '{progress_state}': {status_data}")

    except FabricHTTPException as fe:
        logs.append(f"Fabric HTTP Exception: {fe}")
    except WorkspaceNotFoundException as we:
        logs.append(f"Workspace not found: {we}")
    except Exception as e:
        logs.append(f"An unexpected error occurred: {e}")
    return logs



StatementMeta(, 1095e3f8-0ab6-4152-80b2-f37dd4dcbb47, 18, Finished, Available, Finished)