# Time Analysis


In [None]:
from pyspark.sql import SparkSession

# Start a Spark session, or reuse the one that is already active
spark = SparkSession.builder.getOrCreate()

## Import Libraries


In [None]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import DataFrame
from pyspark.sql.functions import broadcast

## Read data


In [None]:
# Load the semicolon-separated event log; with inferSchema=False every column
# is read as a string, so "Datum" is parsed into a timestamp explicitly.
# NOTE(review): in Spark datetime patterns "h" is the 12-hour clock-hour (1-12);
# if "Datum" holds 24-hour times the pattern probably needs "H" -- confirm
# against the raw data.
df = (
    spark.read.csv("Raw/data.csv", header=True, inferSchema=False, sep=";")
    .withColumn("Datum", F.to_timestamp("Datum", "d.M.yy h:mm"))
    # keep only rows that carry a case_key
    .filter(F.col("case_key").isNotNull())
)

## Analysis


In [None]:
def event_time_dist(
    df: DataFrame, event: str, timestamp: str, time_var: str
) -> DataFrame:
    """
    Count how often each event occurs per value of a calendar component of its timestamp.

    The component is extracted from the timestamp (it is not a truncation), so
    e.g. "day" is the day of the month and rows from different months that
    fall on the same day number are counted together.

    Args:
        df (DataFrame): The input dataframe.
        event (str): The column name for the event.
        timestamp (str): The column name for the timestamp.
        time_var (str): The component to group by: "hour" (hour of day),
            "day" (day of month), "week" (week of year), "month" or "year".
            The extracted value is stored in a column of this same name.

    Returns:
        DataFrame: One row per (event, time_var) combination with a "count"
        column, ordered by event and then by time_var.

    Raises:
        ValueError: If time_var is not one of the supported components.

    Example:
        result_df = event_time_dist(df, "Funktion", "Timestamp", "hour")
    """
    # Supported time_var values mapped to the name of the matching extractor
    # in pyspark.sql.functions; resolved lazily so validation happens first.
    extractor_names = {
        "hour": "hour",
        "day": "dayofmonth",
        "week": "weekofyear",
        "month": "month",
        "year": "year",
    }
    if time_var not in extractor_names:
        raise ValueError(
            "Invalid time_var. Choose from 'hour', 'day', 'week', 'month', or 'year'."
        )

    extract = getattr(F, extractor_names[time_var])
    df = df.withColumn(time_var, extract(F.col(timestamp)))

    return df.groupBy(event, time_var).count().orderBy(event, time_var)


# Show how often each "Funktion" value occurs per year of "Datum"
event_time_dist(df, "Funktion", "Datum", "year").show(truncate=False)

# Duration between events


In [None]:
def case_throughput_time(
    df: DataFrame, case_column: str, timestamp_column: str, time_unit: str
) -> DataFrame:
    """
    Calculate the time between the earliest and the latest event of each case.

    Args:
        df (DataFrame): The input DataFrame.
        case_column (str): The column name for the case key.
        timestamp_column (str): The column name for the timestamp.
        time_unit (str): The unit of the result (seconds, minutes, hours, or days).

    Returns:
        DataFrame: One row per case with the columns case_column,
        "first_timestamp", "last_timestamp" and "time_difference"
        (last minus first, expressed in time_unit).

    Raises:
        ValueError: If time_unit is not one of the supported units.

    Example:
        result_df = case_throughput_time(df, "Case_Key", "Timestamp", "seconds")
        result_df.show()
    """
    # Validate up front so a bad unit fails before any Spark plan is built
    conversion_factors = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    if time_unit not in conversion_factors:
        raise ValueError(
            "Invalid time_unit. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )
    seconds_per_unit = conversion_factors[time_unit]

    # A plain aggregation yields exactly one row per case, so there is no need
    # to compute the bounds on every event row and de-duplicate afterwards.
    per_case = df.groupBy(case_column).agg(
        F.min(timestamp_column).alias("first_timestamp"),
        F.max(timestamp_column).alias("last_timestamp"),
    )

    # unix_timestamp works in whole seconds; divide to get the requested unit
    elapsed_seconds = F.unix_timestamp("last_timestamp") - F.unix_timestamp(
        "first_timestamp"
    )
    return per_case.withColumn("time_difference", elapsed_seconds / seconds_per_unit)


# Example usage: throughput time of every case in minutes
# result_df = case_throughput_time(df, "case_key", "Datum", "minutes")
# result_df.show(truncate=False)
case_throughput_time(df, "case_key", "Datum", "minutes").show(truncate=False)

In [None]:
def event_duration_stats(
    df: DataFrame,
    case_column: str,
    timestamp_column: str,
    event_column: str,
    start_event: str,
    end_event: str,
    time_unit: str,
) -> DataFrame:
    """
    Calculate summary statistics of the duration between a start event and an end event.

    Start rows and end rows are inner-joined on the case key only, so a case
    with several start or end events contributes one duration per
    (start, end) pair, and a pair whose end precedes its start yields a
    negative duration.

    Args:
        df (DataFrame): The input DataFrame.
        case_column (str): The column name for the case key.
        timestamp_column (str): The column name for the timestamp.
        event_column (str): The column name for the event.
        start_event (str): The name of the start event.
        end_event (str): The name of the end event.
        time_unit (str): The time unit to measure the duration (seconds, minutes, hours, or days).

    Returns:
        DataFrame: A single-row DataFrame with total, average, median
        (approximate), min, max, range, variance, stddev, skewness and
        kurtosis of the duration; column names are suffixed with time_unit.

    Raises:
        ValueError: If time_unit is unsupported, a required column is missing,
            or no row matches start_event or end_event.

    Example:
        result_df = event_duration_stats(df, "CASE_KEY", "Datum", "Funktion", "Antrag Start", "Sync", "minutes")
        result_df.show(truncate=False)
    """
    # Cheap argument checks first, before any Spark job is triggered
    time_units = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    if time_unit not in time_units:
        raise ValueError(
            "Invalid time_unit. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )
    conversion_factor = time_units[time_unit]

    # NOTE(review): this comparison is exact-case, whereas Spark resolves
    # column names case-insensitively by default -- confirm callers pass the
    # column names exactly as they appear in df.columns.
    missing_cols = {case_column, timestamp_column, event_column} - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

    # Filter for start and end events and rename timestamp columns
    start_df = (
        df.filter(F.col(event_column) == start_event)
        .select(case_column, timestamp_column)
        .withColumnRenamed(timestamp_column, "start_timestamp")
    )
    end_df = (
        df.filter(F.col(event_column) == end_event)
        .select(case_column, timestamp_column)
        .withColumnRenamed(timestamp_column, "end_timestamp")
    )

    # Fetching a single row is enough to prove non-emptiness; a full count()
    # would scan every matching row just to compare the result with zero.
    if not start_df.take(1):
        raise ValueError(f"No rows with event '{start_event}'")
    if not end_df.take(1):
        raise ValueError(f"No rows with event '{end_event}'")

    # Pair start and end rows of the same case and express the gap in time_unit
    duration_df = start_df.join(end_df, on=case_column).withColumn(
        "duration",
        (F.unix_timestamp("end_timestamp") - F.unix_timestamp("start_timestamp"))
        / conversion_factor,
    )

    # Calculate statistical measures on the duration
    return duration_df.agg(
        F.sum("duration").alias(f"total_duration_{time_unit}"),
        F.avg("duration").alias(f"average_duration_{time_unit}"),
        F.expr("percentile_approx(duration, 0.5)").alias(
            f"median_duration_{time_unit}"
        ),
        F.min("duration").alias(f"min_duration_{time_unit}"),
        F.max("duration").alias(f"max_duration_{time_unit}"),
        (F.max("duration") - F.min("duration")).alias(f"range_duration_{time_unit}"),
        F.variance("duration").alias(f"variance_duration_{time_unit}"),
        F.stddev("duration").alias(f"stddev_duration_{time_unit}"),
        F.skewness("duration").alias(f"skewness_duration_{time_unit}"),
        F.kurtosis("duration").alias(f"kurtosis_duration_{time_unit}"),
    )


# Example usage: duration statistics between "Antrag Start" and "Sync" in minutes
# NOTE(review): the case column is spelled "CASE_KEY" here but "case_key" where
# the data is loaded; the function compares column names exactly, so confirm
# which spelling the CSV header really uses.
result_df = event_duration_stats(
    df, "CASE_KEY", "Datum", "Funktion", "Antrag Start", "Sync", "minutes"
)
result_df.show(truncate=False)

In [None]:
def throughput_time_stats(
    df: DataFrame,
    case_column: str,
    timestamp_column: str,
    event_column: str,
    time_unit: str,
) -> DataFrame:
    """
    Summarise, across event types, the span between each event's earliest and latest occurrence.

    For every distinct value of event_column the span between its smallest
    and largest timestamp over all rows is computed; the returned statistics
    describe the distribution of those per-event spans. case_column is only
    checked for presence and does not take part in the grouping.

    Args:
        df (DataFrame): The input DataFrame.
        case_column (str): The column name for the case key.
        timestamp_column (str): The column name for the timestamp.
        event_column (str): The column name for the event.
        time_unit (str): The unit of the spans (seconds, minutes, hours, or days).

    Returns:
        DataFrame: A single-row DataFrame with average, median (approximate),
        min, max, variance, stddev, skewness and kurtosis of the per-event
        span; column names are suffixed with time_unit.

    Raises:
        ValueError: If a required column is missing or time_unit is unsupported.

    Example:
        result_df = throughput_time_stats(df, "CASE_KEY", "Datum", "Funktion", "minutes")
        result_df.show(truncate=False)
    """
    # All three columns must be present before anything is computed
    absent = {case_column, timestamp_column, event_column} - set(df.columns)
    if absent:
        raise ValueError(f"Missing columns in DataFrame: {absent}")

    # Work on epoch seconds so the span is a plain subtraction
    with_epoch = df.withColumn("unix_timestamp", F.unix_timestamp(timestamp_column))

    seconds_per_unit = {
        "seconds": 1,
        "minutes": 60,
        "hours": 3600,
        "days": 86400,
    }.get(time_unit)
    if seconds_per_unit is None:
        raise ValueError(
            "Invalid time_unit. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )

    span_col = f"duration_{time_unit}"

    # One row per event: latest minus earliest occurrence, in time_unit
    span = (F.max("unix_timestamp") - F.min("unix_timestamp")) / seconds_per_unit
    per_event = with_epoch.groupBy(event_column).agg(span.alias(span_col))

    # Collapse the per-event spans into one row of summary statistics
    summary = [
        F.avg(span_col).alias(f"average_duration_{time_unit}"),
        F.expr(f"percentile_approx({span_col}, 0.5)").alias(
            f"median_duration_{time_unit}"
        ),
        F.min(span_col).alias(f"min_duration_{time_unit}"),
        F.max(span_col).alias(f"max_duration_{time_unit}"),
        F.variance(span_col).alias(f"variance_duration_{time_unit}"),
        F.stddev(span_col).alias(f"stddev_duration_{time_unit}"),
        F.skewness(span_col).alias(f"skewness_duration_{time_unit}"),
        F.kurtosis(span_col).alias(f"kurtosis_duration_{time_unit}"),
    ]
    return per_event.agg(*summary)


# Example usage: statistics over the per-event time spans, in minutes
# NOTE(review): "CASE_KEY" is spelled "case_key" where the data is loaded and
# the function compares column names exactly -- confirm the real header.
result_df = throughput_time_stats(
    df, "CASE_KEY", "Datum", "Funktion", "minutes")
result_df.show(truncate=False)

In [None]:
def event_transition_duration(
    df: DataFrame,
    case_column: str,
    timestamp_column: str,
    event_column: str,
    time_unit: str,
) -> DataFrame:
    """
    Calculate duration statistics for every direct transition between two consecutive events of a case.

    Within each case the events are ordered by timestamp; each event is paired
    with its successor and the duration of the transition "event -> next
    event" is the time from the event to that successor.

    Args:
        df (DataFrame): The input DataFrame.
        case_column (str): The column name for the case key.
        timestamp_column (str): The column name for the timestamp.
        event_column (str): The column name for the event.
        time_unit (str): The time unit to measure the duration (seconds, minutes, hours, or days).

    Returns:
        DataFrame: One row per transition with mean, median (approximate),
        min, max, variance, stddev, skewness, kurtosis and range of its
        duration, ordered by mean duration descending.

    Raises:
        ValueError: If time_unit is not one of the supported units.

    Example:
        result_df = event_transition_duration(df, "case_key", "Datum", "Funktion", "minutes")
        result_df.show(truncate=False)
    """
    # Determine the time conversion factor (validated before any Spark work)
    time_units = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    if time_unit not in time_units:
        raise ValueError(
            "Invalid time_unit. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )
    time_factor = time_units[time_unit]
    duration_col = f"duration_{time_unit}"

    # Add a column with Unix timestamps
    df = df.withColumn("unix_timestamp", F.unix_timestamp(timestamp_column))

    # Events of one case in chronological order; rows that share the same
    # second within a case have no defined relative order.
    window_spec = Window.partitionBy(case_column).orderBy("unix_timestamp")

    df = (
        df.withColumn("next_event", F.lead(event_column).over(window_spec))
        .withColumn("next_timestamp", F.lead("unix_timestamp").over(window_spec))
        # The last event of a case has no successor and starts no transition
        .filter(F.col("next_event").isNotNull())
        .withColumn("transition", F.concat_ws(" -> ", event_column, "next_event"))
        # Time from this event to its successor, i.e. the duration of exactly
        # the transition named in the "transition" column
        .withColumn(
            duration_col,
            (F.col("next_timestamp") - F.col("unix_timestamp")) / time_factor,
        )
    )

    # Calculate statistical measures on the duration
    stats_df = (
        df.groupBy("transition")
        .agg(
            F.avg(duration_col).alias(f"mean_duration_{time_unit}"),
            F.expr(f"percentile_approx({duration_col}, 0.5)").alias(
                f"median_duration_{time_unit}"
            ),
            F.min(duration_col).alias(f"min_duration_{time_unit}"),
            F.max(duration_col).alias(f"max_duration_{time_unit}"),
            F.variance(duration_col).alias(f"variance_{time_unit}"),
            F.stddev(duration_col).alias(f"stddev_{time_unit}"),
            F.skewness(duration_col).alias(f"skewness_{time_unit}"),
            F.kurtosis(duration_col).alias(f"kurtosis_{time_unit}"),
            (F.max(duration_col) - F.min(duration_col)).alias(f"range_{time_unit}"),
        )
        .orderBy(F.desc(f"mean_duration_{time_unit}"))
    )

    return stats_df


# Example usage: duration statistics per event-to-event transition, in minutes
result_df = event_transition_duration(
    df, "case_key", "Datum", "Funktion", "minutes")
result_df.show(truncate=False)

In [None]:
def case_event_duration_comparison(
    df: DataFrame, case_key: str, timestamp: str, event: str, time_var: str
) -> DataFrame:
    """
    Compare the duration of every individual transition with the typical duration of that transition.

    Within each case the events are ordered by timestamp and each event is
    paired with its successor. For every such step the time to the successor
    is reported next to the mean and (approximate) median duration of the
    same "event -> next event" transition over all cases, the percentage
    deviation from both, and the transition's mean absolute deviation.

    Args:
        df (DataFrame): The input DataFrame.
        case_key (str): The column name for the case key.
        timestamp (str): The column name for the timestamp.
        event (str): The column name for the event.
        time_var (str): The time unit of the durations (seconds, minutes, hours, or days).

    Returns:
        DataFrame: One row per transition occurrence with the case key,
        "transition", the duration, the transition's mean and median
        duration, the percentage deviations from both and the mean absolute
        deviation, ordered by mean transition duration descending. The
        percentage columns are computed by dividing by the mean / median.

    Raises:
        ValueError: If time_var is not one of the supported units.

    Example:
        result_df = case_event_duration_comparison(df, "case_key", "Datum", "Funktion", "minutes")
        result_df.show()
    """
    # Mapping for time conversion factors
    time_factors = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    if time_var not in time_factors:
        raise ValueError(
            "Invalid time_var. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )
    time_factor = time_factors[time_var]

    duration_col = f"duration_{time_var}"
    mean_col = f"mean_transition_duration_{time_var}"
    median_col = f"median_transition_duration_{time_var}"
    mad_col = f"mean_absolute_deviation_{time_var}"

    # Add unix timestamp column
    df = df.withColumn("unix_timestamp", F.unix_timestamp(timestamp))

    # Events of one case in chronological order
    case_window = Window.partitionBy(case_key).orderBy("unix_timestamp")

    df = (
        df.withColumn("next_event", F.lead(event).over(case_window))
        .withColumn("next_timestamp", F.lead("unix_timestamp").over(case_window))
        # concat_ws skips nulls and never returns null itself, so the end of a
        # case has to be detected on next_event, not on the transition label
        .filter(F.col("next_event").isNotNull())
        .withColumn("transition", F.concat_ws(" -> ", event, "next_event"))
        # Time from this event to its successor: the duration of the
        # transition named in the "transition" column
        .withColumn(
            duration_col,
            (F.col("next_timestamp") - F.col("unix_timestamp")) / time_factor,
        )
        # Reused twice below (aggregation and join)
        .cache()
    )

    # Typical duration of each transition over all cases
    transition_stats = df.groupBy("transition").agg(
        F.avg(duration_col).alias(mean_col),
        F.expr(f"percentile_approx({duration_col}, 0.5)").alias(median_col),
    )

    # One row per distinct transition, hence the broadcast hint
    df = df.join(broadcast(transition_stats), on="transition", how="left")

    # Mean absolute deviation is measured against the mean of the same
    # transition, so it is averaged over all rows sharing that transition
    transition_window = Window.partitionBy("transition")
    df = df.withColumn(
        mad_col,
        F.avg(F.abs(F.col(duration_col) - F.col(mean_col))).over(transition_window),
    )

    # Calculate percentage deviations
    df = df.withColumn(
        f"percentage_deviation_{time_var}",
        ((F.col(duration_col) - F.col(mean_col)) / F.col(mean_col)) * 100,
    ).withColumn(
        f"percentage_median_deviation_{time_var}",
        ((F.col(duration_col) - F.col(median_col)) / F.col(median_col)) * 100,
    )

    # Order the DataFrame by mean transition duration and select required columns
    return df.orderBy(F.col(mean_col).desc()).select(
        case_key,
        "transition",
        duration_col,
        mean_col,
        f"percentage_deviation_{time_var}",
        median_col,
        f"percentage_median_deviation_{time_var}",
        mad_col,
    )


# Example usage: per-step transition durations compared with the transition's typical duration, in minutes
# NOTE(review): the case column is spelled "CASE_KEY" here but "case_key" in
# the other cells -- confirm which spelling the CSV header really uses.
result_df = case_event_duration_comparison(
    df, "CASE_KEY", "Datum", "Funktion", "minutes"
)
result_df.show()

In [None]:
def calculate_most_frequent_sequence_all_events(
    df: DataFrame, case_key: str, event: str, timestamp: str, time_var: str
) -> DataFrame:
    """
    Rank the complete event sequences of all cases by frequency and summarise their durations.

    For each case the events are ordered by timestamp and turned into a
    sequence of "event -> next event" steps joined by ", ". Cases sharing the
    same sequence are grouped; the duration of a case runs from its first
    event to the last event reached by a step.

    Args:
        df (DataFrame): The input DataFrame.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.
        timestamp (str): The column name for the timestamp.
        time_var (str): The time unit of the durations (seconds, minutes, hours, or days).

    Returns:
        DataFrame: One row per distinct event sequence with "count", min, max,
        mean, median (approximate), variance, stddev, kurtosis and range of
        the case duration, plus "percentage" (share of all counted cases),
        ordered by count descending.

    Raises:
        ValueError: If time_var is not one of the supported units.

    Note:
        Computing the percentage runs a Spark job eagerly to obtain the total
        number of counted cases.

    Example:
        result_df = calculate_most_frequent_sequence_all_events(df, "case_key", "Funktion", "Datum", "minutes")
        result_df.show(truncate=False)
    """
    time_factors = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    if time_var not in time_factors:
        raise ValueError(
            "Invalid time_var. Choose from 'seconds', 'minutes', 'hours', or 'days'."
        )
    time_factor = time_factors[time_var]
    duration_col = f"duration_{time_var}"

    df = df.withColumn("unix_timestamp", F.unix_timestamp(timestamp))

    # Events of one case in chronological order
    window = Window.partitionBy(case_key).orderBy("unix_timestamp")

    df_steps = (
        df.withColumn("next_event", F.lead(event).over(window))
        .withColumn("next_unix_timestamp", F.lead("unix_timestamp").over(window))
        # Position of the step inside its case, used to restore the order
        # after grouping
        .withColumn("step_index", F.row_number().over(window))
        # The last event of a case has no successor and starts no step
        .filter(F.col("next_event").isNotNull())
        .withColumn(
            "event_sequence",
            F.concat(F.col(event), F.lit(" -> "), F.col("next_event")),
        )
    )

    # collect_list gives no ordering guarantee after a shuffle, so the steps
    # are collected together with their position and sorted explicitly.
    # The duration uses min/max rather than first/last (which depend on row
    # order) and ends at the successor timestamp so the final step is included.
    df_event_sequence = (
        df_steps.groupBy(case_key)
        .agg(
            F.sort_array(
                F.collect_list(F.struct("step_index", "event_sequence"))
            ).alias("ordered_steps"),
            (F.max("next_unix_timestamp") - F.min("unix_timestamp")).alias(
                "duration"
            ),
        )
        .withColumn(
            "event_sequence",
            F.concat_ws(", ", F.col("ordered_steps.event_sequence")),
        )
        .drop("ordered_steps")
        .withColumn(duration_col, F.col("duration") / time_factor)
    )

    df_most_frequent_sequence_all_events = (
        df_event_sequence.groupBy("event_sequence")
        .agg(
            F.count(case_key).alias("count"),
            F.min(duration_col).alias(f"min_duration_{time_var}"),
            F.max(duration_col).alias(f"max_duration_{time_var}"),
            F.mean(duration_col).alias(f"mean_duration_{time_var}"),
            F.expr(f"percentile_approx({duration_col}, 0.5)").alias(
                f"median_duration_{time_var}"
            ),
            F.variance(duration_col).alias(f"variance_{time_var}"),
            F.stddev(duration_col).alias(f"stddev_{time_var}"),
            F.kurtosis(duration_col).alias(f"kurtosis_{time_var}"),
        )
        .withColumn(
            f"range_{time_var}",
            F.col(f"max_duration_{time_var}") - F.col(f"min_duration_{time_var}"),
        )
        .orderBy(F.desc("count"))
    )

    # Eager action: total number of counted cases, the percentage denominator
    total_count = df_most_frequent_sequence_all_events.select(F.sum("count")).first()[0]

    return df_most_frequent_sequence_all_events.withColumn(
        "percentage", (F.col("count") / total_count) * 100
    )


# Show the event sequences ranked by frequency, with durations in minutes
calculate_most_frequent_sequence_all_events(
    df, "case_key", "Funktion", "Datum", "minutes"
).show(truncate=False)

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()