In [1]:
import pandas as pd


In [2]:
# Load the capacity-metrics export from Excel into a DataFrame.
# NOTE(review): the file name suggests the export covers 14 days — confirm.
excel_file = "Fabric_CU_metrics14days.xlsx"
df = pd.read_excel(excel_file)

# Save a CSV copy of the same data (row index omitted), then display the frame.
csv_file = "fabric_capacity_units.csv"
df.to_csv(csv_file, index=False)
df

Unnamed: 0,Workspace,Item kind,Item name,CU (s),Duration (s),Users,Rejected count,Billing type
0,MDP-FABRIC-PRD,Dataflow,LEM_DataFlow_PRD,1.267411e+07,347485.003,1.0,0.0,Billable
1,MDP-FABRIC-PRD,SynapseNotebook,NB_Infor_101_Ingest_Notebooks,5.341069e+06,105393.327,1.0,0.0,Billable
2,MDP-FABRIC-PRD,Dataflow,MasterBuilder_DataFlow_PRD,2.888313e+06,11483.654,1.0,0.0,Billable
3,MDP-FABRIC-PRD,SynapseNotebook,NB_Infor_100_Refresh_ReplicationSet_And_GL,1.635235e+06,37332.999,1.0,0.0,Billable
4,MDP-FABRIC-PRD,Dataset,Infor_PRD,1.582654e+06,105300.394,3.0,0.0,Billable
...,...,...,...,...,...,...,...,...
67,MDP-FABRIC-PRD,Dataset,Coupa_PRD,2.539200e+01,10.398,3.0,0.0,Billable
68,MDP-FABRIC-PRD,Dataset,MasterBuilder_PRD,1.008000e+00,0.970,2.0,0.0,Billable
69,Total,,,3.525187e+07,2809062.193,16.0,0.0,Billable
70,,,,,,,,


In [3]:
def clean_data(df):
    """Prepare the raw capacity-metrics frame for cost analysis.

    Steps:
    - Discard every row that has a NaN in at least one column.
    - When both "Item name" and "CU (s)" are present, keep only those two
      columns (renamed to "Item_name" and "CU(s)") and append "CU(h)",
      computed as CU(s) / 3600.
    - Otherwise hand back the NaN-free frame with no further changes.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    seconds_per_hour = 3600
    # Source column -> name used by the rest of the notebook.
    column_aliases = {"Item name": "Item_name", "CU (s)": "CU(s)"}

    complete_rows = df.dropna(axis=0, how="any")

    # Guard clause: without both source columns there is nothing to extract.
    if not all(source in complete_rows.columns for source in column_aliases):
        return complete_rows

    trimmed = complete_rows[list(column_aliases)].rename(columns=column_aliases)
    trimmed["CU(h)"] = trimmed["CU(s)"] / seconds_per_hour
    return trimmed


In [4]:
# Replace the raw frame with its cleaned form and display the result.
df = clean_data(df)
df

Unnamed: 0,Item_name,CU(s),CU(h)
0,LEM_DataFlow_PRD,1.267411e+07,3520.587449
1,NB_Infor_101_Ingest_Notebooks,5.341069e+06,1483.630283
2,MasterBuilder_DataFlow_PRD,2.888313e+06,802.309191
3,NB_Infor_100_Refresh_ReplicationSet_And_GL,1.635235e+06,454.231913
4,Infor_PRD,1.582654e+06,439.626196
...,...,...,...
64,PL_FieldForce_100_IngestMaster_AT,2.620800e+02,0.072800
65,PL_FieldForce_101_IngestOrchestrator_AT,2.620800e+02,0.072800
66,NB_PipelineStatusEmail,1.391205e+02,0.038645
67,Coupa_PRD,2.539200e+01,0.007053


### CU Calculation Based on SKU Type

In [15]:
def calculate_CU_and_cost(sku_capacity, period="day", value=1):
    """Work out the CU budget and price of running a SKU for a span of time.

    Args:
        sku_capacity (int): CU capacity of the SKU (e.g., 64 for F64, 32 for F32).
        period (str): Unit of time: "hour", "day", "month" or "year".
            A month counts as 30 days and a year as 365 days.
        value (int or float): How many of those periods (e.g., 5 for 5 days).

    Returns:
        tuple: (CU-seconds, CU-hours, PAYG cost, reservation cost).

    Raises:
        ValueError: If ``period`` is not one of the supported options.
    """
    # Hours contained in one unit of each supported period.
    hours_in_period = {
        "hour": 1,
        "day": 24,
        "month": 30 * 24,
        "year": 365 * 24,
    }
    if period not in hours_in_period:
        raise ValueError("Invalid period. Choose from 'hour', 'day', 'month', or 'year'.")

    # Both prices are quoted per block of 64 CU-hours.
    block_size = 64
    payg_price_per_block = 11.52
    reserved_price_per_block = 6.853

    cu_hours = sku_capacity * (hours_in_period[period] * value)
    blocks = cu_hours / block_size
    cu_seconds = cu_hours * 3600  # 3600 seconds per hour

    return (
        cu_seconds,
        cu_hours,
        blocks * payg_price_per_block,
        blocks * reserved_price_per_block,
    )

# Demo: one day on a SKU with 64 CUs.
CU_seconds, CU_hours, cost_PAYG, cost_reservation = calculate_CU_and_cost(
    sku_capacity=64, period="day", value=1
)

# Report each figure rounded to two decimals, one per line.
report_lines = [
    f"CU-seconds: {round(CU_seconds, 2)} seconds",
    f"CU-hours: {round(CU_hours, 2)} hours",
    f"Cost (PAYG): ${round(cost_PAYG, 2)}",
    f"Cost (Reservation): ${round(cost_reservation, 2)}",
]
print("\n".join(report_lines))


CU-seconds: 5529600 seconds
CU-hours: 1536 hours
Cost (PAYG): $276.48
Cost (Reservation): $164.47


In [None]:
from pyspark.sql.functions import col

# Capacity and pricing constants shared by the cost cells below.
# The hourly rates (11.52 PAYG, 6.853 reserved) are the same per-64-CU-hour
# prices used in calculate_CU_and_cost; multiplying by 24 gives the daily cost.
TOTAL_DAILYCOST_PAYG = 11.52 * 24
TOTAL_DAILYCOST_RESERVED = 6.853 * 24
# CU-hours available per day on a 64-CU capacity (64 CUs * 24 h = 1536).
TOTAL_CU_HOURS = 64 * 24

# Number of days in each reporting period.
WEEK = 7
MONTH = 30
YEAR = 365  # was 356 (typo), which understated every yearly figure by 9 days


def calculate_execution_costs(df_filtered, days_observed=14):
    """
    Calculate the average daily PAYG and Reserved cost of each row from its CU usage.

    Each row's "CU(h)" is treated as usage accumulated over ``days_observed``
    days. It is averaged to a per-day figure, expressed as a share of the
    capacity's daily CU-hours (TOTAL_CU_HOURS), and priced with the daily
    capacity costs (TOTAL_DAILYCOST_PAYG / TOTAL_DAILYCOST_RESERVED).

    Parameters:
    df_filtered (pd.DataFrame): Input DataFrame containing a "CU(h)" column.
        It is not modified; the new columns are added to a copy.
    days_observed (int or float, optional): Number of days the "CU(h)" totals
        cover. Default is 14.

    Returns:
    pd.DataFrame: Copy of the input with three extra columns:
        "CU(h) per day", "PAYG Cost($)" and "Reserved Cost($)".

    Raises:
    ValueError: If ``days_observed`` is not positive.
    KeyError: If the input has no "CU(h)" column.
    """
    if days_observed <= 0:
        raise ValueError("days_observed must be a positive number of days.")

    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell always starts from the same input.
    costed = df_filtered.copy()

    cu_hours_per_day = costed["CU(h)"] / days_observed
    # Fraction of one day's capacity that this row consumes on an average day.
    capacity_share = cu_hours_per_day / TOTAL_CU_HOURS

    costed["CU(h) per day"] = cu_hours_per_day
    costed["PAYG Cost($)"] = capacity_share * TOTAL_DAILYCOST_PAYG
    costed["Reserved Cost($)"] = capacity_share * TOTAL_DAILYCOST_RESERVED

    print("\n" + "🔹" * 20)
    print("🔹🔹🔹 Cost Per Day for Each Execution 🔹🔹🔹")
    print("🔹" * 20 + "\n")
    return costed


# Add the per-day CU and cost columns to the cleaned frame.
df_filtered = calculate_execution_costs(df)
# Display the rows ordered by CU(h), largest first (the sorted frame is not stored).
df_filtered.sort_values(by="CU(h)", ascending=False)


🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹
🔹🔹🔹 Cost Per Day for Each Execution 🔹🔹🔹
🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹



Unnamed: 0,Item_name,CU(s),CU(h),CU(h) per day,PAYG Cost($),Reserved Cost($)
0,LEM_DataFlow_PRD,1.267411e+07,3520.587449,251.470532,45.264696,26.926993
1,NB_Infor_101_Ingest_Notebooks,5.341069e+06,1483.630283,105.973592,19.075246,11.347453
2,MasterBuilder_DataFlow_PRD,2.888313e+06,802.309191,57.307799,10.315404,6.136412
3,NB_Infor_100_Refresh_ReplicationSet_And_GL,1.635235e+06,454.231913,32.445137,5.840125,3.474164
4,Infor_PRD,1.582654e+06,439.626196,31.401871,5.652337,3.362453
...,...,...,...,...,...,...
64,PL_FieldForce_100_IngestMaster_AT,2.620800e+02,0.072800,0.005200,0.000936,0.000557
65,PL_FieldForce_101_IngestOrchestrator_AT,2.620800e+02,0.072800,0.005200,0.000936,0.000557
66,NB_PipelineStatusEmail,1.391205e+02,0.038645,0.002760,0.000497,0.000296
67,Coupa_PRD,2.539200e+01,0.007053,0.000504,0.000091,0.000054


In [None]:
# Summarise CU usage and cost across all items for daily, weekly, monthly and
# yearly horizons.
#
# Every total starts from a per-day sum and is scaled by the number of days in
# the period. "CU(h) per day" is used rather than "CU(h)" because "CU(h)" holds
# the total for the whole observation window, while the two cost columns are
# already per-day figures; mixing them would label a multi-day CU total as
# "Daily" and overstate utilization.
period_days = {"Daily": 1, "Weekly": WEEK, "Monthly": MONTH, "Yearly": YEAR}

daily_cu_hours = df_filtered["CU(h) per day"].sum()
daily_payg_cost = df_filtered["PAYG Cost($)"].sum()
daily_reserved_cost = df_filtered["Reserved Cost($)"].sum()

# Share of the capacity's daily CU-hours that is consumed, as a percentage to
# match the column label. Usage and capacity scale by the same number of days,
# so the value is identical for every period.
utilization_pct = 100 * daily_cu_hours / TOTAL_CU_HOURS

summary_df = pd.DataFrame(
    {
        "Total CU(h)": [daily_cu_hours * days for days in period_days.values()],
        "Total PAYG Cost ($)": [
            daily_payg_cost * days for days in period_days.values()
        ],
        "Total Reserved Cost ($)": [
            daily_reserved_cost * days for days in period_days.values()
        ],
        "Total CU Utilization(%)": [utilization_pct] * len(period_days),
    },
    index=list(period_days),
)

# Print another distinct heading
print("\n" + "🔹" * 28)
print("◽◽ COST SUMMARY For All Current Runs Schedules In All Projects ◽◽")
print("🔹" * 28 + "\n")

# Display the extended summary DataFrame
display(summary_df)

In [None]:
# Rough monthly PAYG estimate: total CU-hours averaged over 14 days
# (presumably the length of the export window — confirm), scaled to 30 days
# and priced at 0.18 per CU-hour. 0.18 equals 11.52 / 64, i.e. the PAYG rate
# per 64 CU-hours used in calculate_CU_and_cost divided by 64.
avg_daily_cu_hours = df_filtered["CU(h)"].sum() / 14
avg_daily_cu_hours * 30 * 0.18