# In [4]:
# add a spark session
from pyspark.sql import SparkSession

# Module-level session handle: builds a session named "example" via the
# builder's getOrCreate() call.
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.window import Window


def calculate_percentage_rank(
    df: DataFrame, key_column: str, duration_column: str
) -> DataFrame:
    """
    Rank every row by its duration and express that rank as a percentage.

    Rows are ranked over the whole DataFrame in ascending order of
    ``duration_column`` (the shortest duration gets rank 1; ties share a
    rank). The rank is then rescaled to ``100 * (1 - rank / max_rank)``, so
    the shortest duration receives the highest percentage and the rows
    holding the largest rank receive 0.

    :param df: Input DataFrame containing the data.
    :param key_column: Name of the key column (e.g., CASE_KEY). Accepted for
        interface compatibility; the ranking is global and is NOT partitioned
        by this column.
    :param duration_column: Name of the duration column to rank by.
    :return: The input DataFrame with two additional columns:
        'Duration_Rank' (ascending rank of the duration) and
        'Percentage_Rank' (the rank rescaled as described above).
    """
    # Local import keeps this function self-contained with respect to the
    # file's import section.
    from pyspark.sql import functions as F

    # Global ordering by duration. There is no partitionBy, so Spark evaluates
    # this window on a single partition.
    ordered_window = Window.orderBy(col(duration_column).asc())

    # Same ordering, but with a frame spanning every row, so an aggregate
    # evaluated over it sees the entire DataFrame rather than a running prefix.
    full_frame_window = ordered_window.rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing
    )

    # rank() is an actual window function; a bare sort expression such as
    # col(...).asc() cannot be used with .over().
    df = df.withColumn("Duration_Rank", F.rank().over(ordered_window))

    # The largest rank is computed as a window aggregate instead of a
    # collect(), so calling this function does not trigger a Spark job.
    max_rank = F.max(col("Duration_Rank")).over(full_frame_window)

    # NOTE(review): the percentage is derived from the ascending rank, so a
    # lower duration yields a higher percentage -- confirm this is the
    # direction callers expect.
    df = df.withColumn(
        "Percentage_Rank",
        100 * (1 - (col("Duration_Rank") / max_rank)),
    )

    return df