In [1]:
# %pip install semantic-link-labs
# %pip install semantic-link-sempy
# %pip install pyspark
# %pip install semantic-link

StatementMeta(, b4b68dab-9ff3-4559-8db9-c426d9320ace, 3, Finished, Available, Finished)

In [1]:
from pyspark.sql import SparkSession
import json
import pandas as pd
import requests
import concurrent.futures
import time
from pyspark.sql.functions import col
import msal


StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 3, Finished, Available, Finished)

In [None]:

# Azure AD service-principal credentials (replace with your values).
# NOTE(review): these are blank placeholders; avoid committing real secrets in the notebook.
TENANT_ID = ""
CLIENT_ID = ""
CLIENT_SECRET = ""

# Alternative authority/scope/token URL for the Fabric API, kept for reference (disabled):
# AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
# SCOPE = ["https://api.fabric.microsoft.com/.default"]  # Fabric API Scope
# TOKEN_URL = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"

# Active settings: authority and scope used by get_access_token() below.
AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
SCOPE = ["https://analysis.windows.net/powerbi/api/.default"]  # Power BI Scope
TOKEN_URL = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/token"  # not referenced by the cells shown in this notebook



def get_access_token():
    """Return an Azure AD access token for the scopes listed in ``SCOPE``.

    Uses the MSAL client-credentials flow with ``CLIENT_ID`` / ``CLIENT_SECRET``
    against ``AUTHORITY``. The MSAL application object is created on the first
    call and reused afterwards, so repeated calls (this function is invoked once
    per API request) share one MSAL token cache instead of starting from an
    empty cache every time.

    Returns:
        str: the bearer token to put in an ``Authorization`` header.

    Raises:
        Exception: if the MSAL response does not contain ``access_token``; the
            full MSAL response is included in the message.
    """
    # The application is stored on the function object. Re-running this cell
    # redefines the function, which naturally discards the cached application
    # (and therefore picks up edited credentials).
    app = getattr(get_access_token, "_app", None)
    if app is None:
        app = msal.ConfidentialClientApplication(
            CLIENT_ID,
            authority=AUTHORITY,
            client_credential=CLIENT_SECRET,
        )
        get_access_token._app = app

    token_response = app.acquire_token_for_client(scopes=SCOPE)

    if "access_token" not in token_response:
        raise Exception(f"Failed to get token: {token_response}")
    return token_response["access_token"]




StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 4, Finished, Available, Finished)

In [3]:
# REST endpoints used by the cells below.
# Power BI admin endpoint that lists capacities.
CAPACITIES_API_URL = "https://api.powerbi.com/v1.0/myorg/admin/capacities"

# Fabric admin endpoint that lists workspaces.
WORKSPACES_API_URL = "https://api.fabric.microsoft.com/v1/admin/workspaces"

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 5, Finished, Available, Finished)

In [4]:
# List of capacities
def get_fabric_capacities(timeout=60):
    """Fetch the capacity list from ``CAPACITIES_API_URL``.

    Args:
        timeout: seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response (the caller reads its
        ``"value"`` key).

    Raises:
        Exception: if the response status is not 200; the status code and
            response text are included in the message.
        requests.exceptions.Timeout: if the request exceeds ``timeout``.
    """
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}

    response = requests.get(CAPACITIES_API_URL, headers=headers, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")
    return response.json()

# Fetch the capacities and persist them (minus the "users" column) as the
# "fabric_capacities" Delta table, replacing any previous contents.
capacities = get_fabric_capacities()
capacities_df = pd.DataFrame(capacities["value"])
spark_capacities_df = spark.createDataFrame(capacities_df).drop("users")
(
    spark_capacities_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fabric_capacities")
)

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 6, Finished, Available, Finished)

In [5]:
# List of workspaces
def get_workspaces(timeout=60):
    """Fetch every workspace from ``WORKSPACES_API_URL``, following pagination.

    The Fabric admin workspace listing is paged: a response may carry a
    ``continuationUri`` pointing at the next page. All pages are requested and
    their ``"workspaces"`` lists are concatenated, so the result is not
    truncated to the first page.

    Args:
        timeout: seconds to wait for each HTTP request.

    Returns:
        dict: ``{"workspaces": [...]}`` containing the workspaces of all pages.

    Raises:
        Exception: if any page responds with a status other than 200; the
            status code and response text are included in the message.
        requests.exceptions.Timeout: if a request exceeds ``timeout``.
    """
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}

    all_workspaces = []
    next_url = WORKSPACES_API_URL
    while next_url:
        response = requests.get(next_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Error {response.status_code}: {response.text}")

        page = response.json()
        all_workspaces.extend(page.get("workspaces") or [])
        # Absent or null on the last page, which ends the loop.
        next_url = page.get("continuationUri")

    return {"workspaces": all_workspaces}

# Fetch the workspaces and persist them as a Delta table, replacing any
# previous contents. The "workpsaces" spelling of the variables and of the
# table name is kept as-is because later cells / consumers refer to it.
workpsaces = get_workspaces()
workpsaces_df = spark.createDataFrame(workpsaces["workspaces"])
(
    workpsaces_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fabric_workpsaces")
)

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 7, Finished, Available, Finished)

In [6]:
# Collect the workspace IDs on the driver. Only the "id" column is selected
# before converting, so the other columns are not transferred to pandas.
WorkspaceIDs = workpsaces_df.select("id").toPandas()["id"].tolist()

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 8, Finished, Available, Finished)

In [7]:
# List of datasets returned by the Power BI admin datasets endpoint
def get_datasets(timeout=60):
    """Fetch the dataset list from the Power BI admin ``datasets`` endpoint.

    Args:
        timeout: seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response (the caller reads its
        ``"value"`` key).

    Raises:
        Exception: if the response status is not 200; the status code and
            response text are included in the message.
        requests.exceptions.Timeout: if the request exceeds ``timeout``.
    """
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}

    response = requests.get(
        "https://api.powerbi.com/v1.0/myorg/admin/datasets",
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")
    return response.json()

# Fetch the datasets, drop the "users" and "upstreamDatasets" columns, and
# persist the rest as the "datasets" Delta table (replacing previous contents).
# datasets_df (pandas) is reused by the workspace filter further down.
datasets = get_datasets()
datasets_df = pd.DataFrame(datasets["value"]).drop(columns=["users", "upstreamDatasets"])
spark_datasets_df = spark.createDataFrame(datasets_df)
(
    spark_datasets_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("datasets")
)

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 9, Finished, Available, Finished)

  A field of type StructType expects a pandas.DataFrame, but got: <class 'pandas.core.series.Series'>
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


In [8]:
# Collect the dataset IDs on the driver. Only the "id" column is selected
# before converting, so the other columns are not transferred to pandas.
DatasetIDs = spark_datasets_df.select("id").toPandas()["id"].tolist()

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 10, Finished, Available, Finished)

In [9]:
# Custom workspace filtering: IDs of the workspaces whose datasets are kept.
# Each ID appears exactly once, in ascending order.
workspace_ids = [
    "02fd1e90-3ba6-48b5-9f9c-15922ad12258",
    "0453cc18-6202-4445-b41a-4036e6719b08",
    "092e35cf-d04f-4ebb-8d9e-2cc12e736e4e",
    "0a8537f9-f679-4c65-b166-2db0c4fdcd8b",
    "0b0797d1-b0ef-42d7-a1bb-9b97703df039",
    "0c080ff6-45f2-491b-8d07-886762bb187c",
    "0c7b90de-5814-48a6-a494-9036b163c122",
    "0e396047-c341-40df-a788-73fc8e80cf45",
    "11272a63-7dba-4df8-8d67-3db6f36bc1cb",
    "1380020f-75a6-4e54-98a7-0e212c6f8bc1",
    "17ad07cb-415a-4bd8-8a62-b99413535930",
    "1d5f520f-b031-4661-95fe-0f54fdd179ee",
    "217cc489-d70a-42dc-acd2-e212d66e7b40",
    "2229266c-6132-463a-8232-fdcb591f2be7",
    "228db8e8-6203-4ba8-b406-7b1da29c34c1",
    "245909ee-2edf-459e-a771-bd1e4e83d2a0",
    "249abea1-ac3b-47e8-8106-9e7953840573",
    "28a728f4-1333-4958-991a-2ffc1b576a39",
    "2c9573cd-be00-4436-80ef-77701fbe4792",
    "35060111-ead1-48c7-90ad-6470f10fc59e",
    "3ba00f7a-79ef-430f-b005-5abdb305587a",
    "3cff07f4-9d8d-46fa-9ec5-d97c5c530ccb",
    "4047ca4c-c14c-4a9f-98c9-f8a23f199156",
    "45883d13-9597-49e3-9565-8fdf3af7a2ad",
    "4a190a28-7188-4726-875d-f7aad0b21e64",
    "4bc02dfe-903e-445c-b724-3b974bcd0658",
    "4d84457b-1559-4326-b099-a74e85a50bc5",
    "577aefa8-dd74-403d-ad5b-95800a35ddbf",
    "59c26bbc-0ed0-4b1f-ab1a-4e3f22c31fba",
    "5b855151-6d50-4472-954d-038388191b67",
    "5d773d9e-6b75-4e29-bd30-505bfa21f940",
    "5f25fdcb-083e-4da3-80b5-fa6f1c35d537",
    "6952b9a8-6540-417d-ab91-fbb975b9fca6",
    "696ad86b-1d99-4114-b703-493b709b062f",
    "6c1bc232-514f-4419-a60e-6f84d7b98b34",
    "702cdb35-1a7d-48ec-b829-cbc644497067",
    "70f7f773-cbcf-4184-b9cf-5341694a95b0",
    "7520754f-4eb6-4c5d-831c-d7c84aa9355f",
    "762cdec9-ca96-4a30-a5c3-c94a3a4a8e4a",
    "7707be4b-7753-432e-b7f6-c94aaccc7e97",
    "778c431b-4111-4580-a46f-8fc7871dc3bc",
    "77bd7035-0b65-4a7b-85bb-10d833e66e6f",
    "7fbb6ab7-dbe4-4ebb-a2da-52537d629251",
    "81cef3f8-802a-4afb-85db-e60385fd685a",
    "831affc2-2227-42e3-ac1d-a7a2dd1e9756",
    "83ec31a1-2ca3-46fe-a5f8-2bcc8e70f8cd",
    "86125406-c2ed-4d96-b0a6-f0c209165fb0",
    "86805504-a5dd-46d6-ade7-42bc35d2b4f2",
    "8a14eea3-d016-4012-aad4-bd691f10219b",
    "8a542b19-d8fd-44da-8738-9ac2262d6ee1",
    "8b1d9eb6-0b5a-440c-9eb7-b0139459edf6",
    "8ba77fd7-a1ef-4837-8f15-915ba50d0f4a",
    "933590a7-3e78-42d0-aed7-a630866b8ee8",
    "a912e8a5-7138-4452-88a1-5c7df2c99a5a",
    "ac5b2892-2977-4250-8d3e-34485277662e",
    "ac8a466e-cbd2-4c73-9b6e-3b18cded7ee8",
    "b548baa6-d999-490a-9b64-b5b71d369f72",
    "b84686b4-27a1-469e-8763-078e00cb7e7a",
    "b87ed813-90b4-4e47-8db9-5f4244e3df32",
    "bbcb70ff-d86a-4680-9edc-cbbfa0e63129",
    "bc4e98a1-e23e-4608-acee-d7e2eedc0272",
    "c01086da-46e1-483e-b7d3-555d3ca1aa13",
    "c2cce67a-a8ec-46c5-8559-ac563f46bad1",
    "c59c58ad-674c-49be-9eba-912b80463d1b",
    "c936d43b-ab4c-47a4-b331-41fce01ea0e2",
    "cdb45b81-d656-4b26-a53e-fb0f5fc8d6cb",
    "d000ffaf-5870-46bf-826e-e451614916ea",
    "d02d8e4f-94c7-4f47-bc21-f88f884504e6",
    "d2b2d8d9-dff4-461a-b027-8b2a019c849c",
    "d83f57eb-7845-403d-881e-714f47fd0ebf",
    "da203dd1-23ea-4784-9267-e107cd928bf4",
    "daf9cd9f-ccb7-4688-8681-31beb5b2dc62",
    "e153b4e2-9aaf-4cc3-bcf1-809e34bf90d6",
    "ea855082-1d5a-4859-affb-57a2bcc016f1",
    "eec8f192-897a-4898-a243-1602f2404fb8",
    "f27da5be-cc44-4811-ab08-3ded7156605a",
    "fa8be9a8-1e0f-4e31-a713-d92ec93ace81",
    "faa61b04-f0c0-4e77-b206-beaf9d93208e",
    "fab40697-810c-49a2-b6e6-fb92b7cffab1",
    "fd2fc524-9196-41e8-9ecb-2f4e8e8ded50",
]


# Keep only the datasets that live in one of the selected workspaces.
in_selected_workspace = datasets_df["workspaceId"].isin(workspace_ids)
filtered_df = datasets_df[in_selected_workspace]


StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 11, Finished, Available, Finished)

In [None]:
# Number of datasets left after the workspace filter.
len(filtered_df)

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 29, Finished, Available, Finished)

1165

In [11]:
# get_dataset_refresh_histroy
def get_dataset_refresh_histroy(workspaceId, datasetId, timeout=60):
    """Fetch the refresh history of one dataset from the Power BI REST API.

    Args:
        workspaceId: ID of the workspace (group) used in the request URL.
        datasetId: ID of the dataset used in the request URL.
        timeout: seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        pandas.DataFrame: one row per entry of the response's ``"value"`` list
        with the columns ``datasetId``, ``requestId``, ``id``, ``refreshType``,
        ``status``, ``refreshAttempts`` (number of attempts), ``startTime``,
        ``endTime``, ``DataStartTime``, ``DataEndTime``, ``QueryStartTime`` and
        ``QueryEndTime``. The Data*/Query* columns come from the first refresh
        attempt whose ``type`` is ``"Data"`` / ``"Query"``.

        If the response status is not 200, or ``"value"`` is missing or empty,
        a single placeholder row is returned with ``datasetId`` set and every
        other column ``None``.

    Raises:
        requests.exceptions.Timeout: if the request exceeds ``timeout``.
    """
    detail_columns = [
        "requestId", "id", "refreshType", "status", "refreshAttempts",
        "startTime", "endTime", "DataStartTime", "DataEndTime",
        "QueryStartTime", "QueryEndTime"]

    def _placeholder():
        # Single row that keeps the dataset visible even without history.
        return pd.DataFrame([{"datasetId": datasetId} | {key: None for key in detail_columns}])

    def _first_attempt_value(attempts, attempt_type, field):
        # Value of `field` from the first attempt of the given type, else None.
        return next((att.get(field) for att in attempts if att.get("type") == attempt_type), None)

    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}

    url = f"https://api.powerbi.com/v1.0/myorg/groups/{workspaceId}/datasets/{datasetId}/refreshes"
    response = requests.get(url, headers=headers, timeout=timeout)

    # Best effort: a failed lookup is recorded as a placeholder row rather than
    # aborting the caller's loop over all datasets.
    if response.status_code != 200:
        return _placeholder()

    entries = response.json().get("value")
    if not entries:  # "value" is empty or missing
        return _placeholder()

    records = []
    for entry in entries:
        # `or []` also covers a "refreshAttempts" key that is present but null,
        # which would otherwise make len() fail.
        attempts = entry.get("refreshAttempts") or []
        records.append({
            "datasetId": datasetId,
            "requestId": entry.get("requestId", None),
            "id": entry.get("id", None),
            "refreshType": entry.get("refreshType", None),
            "status": entry.get("status", None),
            "refreshAttempts": len(attempts),
            "startTime": entry.get("startTime", None),
            "endTime": entry.get("endTime", None),
            "DataStartTime": _first_attempt_value(attempts, "Data", "startTime"),
            "DataEndTime": _first_attempt_value(attempts, "Data", "endTime"),
            "QueryStartTime": _first_attempt_value(attempts, "Query", "startTime"),
            "QueryEndTime": _first_attempt_value(attempts, "Query", "endTime"),
        })

    return pd.DataFrame(records)


StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 13, Finished, Available, Finished)

In [12]:
# Gather one refresh-history DataFrame per filtered dataset; empty results are
# reported and left out of the list.
all_temp_dfs = []

for _, row in filtered_df.iterrows():
    temp_df = get_dataset_refresh_histroy(row["workspaceId"], row["id"])

    if temp_df.empty:
        print("No data for:", row)
        continue

    all_temp_dfs.append(temp_df)


StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 14, Finished, Available, Finished)

In [25]:
# Combine the per-dataset results and convert them to a Spark DataFrame.
datasets_refresh_history_df = pd.concat(all_temp_dfs, ignore_index=True)
spark_datasets_refresh_history_df = spark.createDataFrame(datasets_refresh_history_df)

# Storage location of the existing refresh-history Delta table.
DATASETS_REFRESH_HISTORY_PATH = "abfss://d3120490-76ae-4ef4-a440-2bd65732ccdc@onelake.dfs.fabric.microsoft.com/fecab367-5d3a-41c1-8037-7801192932ba/Tables/datasets_refresh_history"

# Distinct, non-null requestIds that are already stored.
existing_df = (
    spark.read.format("delta")
    .load(DATASETS_REFRESH_HISTORY_PATH)
    .select("requestId")
    .dropna(subset=["requestId"])  # Remove null requestId
    .dropDuplicates(["requestId"])  # Remove duplicates
)

# Keep only rows whose requestId is not stored yet.
# NOTE(review): rows with a null requestId (the placeholder rows) never match
# the join key, so they pass through on every run — confirm this is intended.
new_spark_datasets_refresh_history_df = spark_datasets_refresh_history_df.join(existing_df, "requestId", "left_anti")

# Count once, before writing: each count() re-runs the join, and the reported
# number should describe what is about to be appended.
new_record_count = new_spark_datasets_refresh_history_df.count()

if new_record_count > 0:
    new_spark_datasets_refresh_history_df.write.format("delta").mode("append").saveAsTable("datasets_refresh_history")
    print(f"Appended {new_record_count} new records to datasets_refresh_history.")
else:
    print("No new records found. Skipping append.")

StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 27, Finished, Available, Finished)

Appended 830 new records to datasets_refresh_history.


In [26]:

# Rewrite the refresh-history Delta table without fully identical rows:
# read it, drop rows that are duplicates across all columns, and overwrite
# the same location with the result.
DATASETS_REFRESH_HISTORY_PATH = "abfss://d3120490-76ae-4ef4-a440-2bd65732ccdc@onelake.dfs.fabric.microsoft.com/fecab367-5d3a-41c1-8037-7801192932ba/Tables/datasets_refresh_history"

df = spark.read.format("delta").load(DATASETS_REFRESH_HISTORY_PATH)
df_deduplicated = df.dropDuplicates()

(
    df_deduplicated.write
    .format("delta")
    .mode("overwrite")
    .save(DATASETS_REFRESH_HISTORY_PATH)
)

print("Duplicate rows removed, and table updated.")


StatementMeta(, 6c416c47-937d-47c9-8c1d-61b27007baf5, 28, Finished, Available, Finished)

Duplicate rows removed, and table updated.
