# Event Analysis


In [None]:
from pyspark.sql import SparkSession

# Start the Spark session used by every cell below (getOrCreate() reuses an
# already active session instead of creating a second one).
spark = SparkSession.builder.getOrCreate()

## Import Libraries


In [None]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

## Read data


In [None]:
# Load the raw event log: semicolon-separated with a header row. With
# inferSchema=False every column is read as a string.
df = spark.read.csv("Raw/data.csv", header=True, inferSchema=False, sep=";")
# Parse "Datum" into a timestamp. The pattern has no am/pm marker, so the hour
# field must be "H" (hour-of-day, 0-23); "h" is the 12-hour clock-hour (1-12)
# and cannot parse hours 0 or 13-23.
# NOTE(review): assumes the source file uses a 24-hour clock - confirm.
df = df.withColumn("Datum", F.to_timestamp("Datum", "d.M.yy H:mm"))
# Drop rows with a missing case_key
df = df.filter(F.col("case_key").isNotNull())

## Analysis Overview


In [None]:
def case_event_overview(
    df: DataFrame, case_column: str, event_column: str
) -> DataFrame:
    """
    Build a case-by-event matrix of occurrence counts.

    The result has one row per case and one column per distinct value of the
    event column; each cell holds how often that event occurs in that case.
    Null cells in numeric columns are replaced with 0.

    Args:
        df (DataFrame): The input DataFrame.
        case_column (str): The column name for the case key.
        event_column (str): The column name for the event.

    Returns:
        DataFrame: One row per case key with one count column per event.

    Example:
        result_df = case_event_overview(df, "CASE_KEY", "Funktion")
    """
    grouped_by_case = df.groupBy(case_column)
    # Each distinct event value becomes its own column holding the row count.
    pivoted_counts = grouped_by_case.pivot(event_column).count()
    # Case/event combinations that never occur come back as null; report 0.
    return pivoted_counts.na.fill(0)


# Build the per-case event-count matrix for the loaded data and display it.
result_df = case_event_overview(df, "case_key", "Funktion")
result_df.show()

In [None]:
def case_event_count(df: DataFrame, case_key: str, event: str) -> DataFrame:
    """
    Summarise how many events each case contains.

    For every case key the result holds the number of non-null event rows and
    the number of distinct event values, sorted so the cases with the most
    events come first.

    Args:
        df (DataFrame): The input dataframe.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.

    Returns:
        DataFrame: One row per case key with the columns "Number of Events" and
        "Number of Distinct Events", ordered by "Number of Events" descending.

    Example:
        example_data = [("CASE1", "Event1"), ("CASE1", "Event2"), ("CASE2", "Event1"), ("CASE1", "Event1")]
        example_df = spark.createDataFrame(example_data, ["CASE_KEY", "Funktion"])
        result_df = case_event_count(example_df, "CASE_KEY", "Funktion")
        result_df.show()
    """
    total_events = F.count(event).alias("Number of Events")
    distinct_events = F.countDistinct(event).alias("Number of Distinct Events")
    return (
        df.groupBy(case_key)
        .agg(total_events, distinct_events)
        .orderBy(F.desc("Number of Events"))
    )

In [None]:
def event_count(df: DataFrame, case_key: str, event: str) -> DataFrame:
    """
    Summarise, per event, how many cases it occurs in and how often it repeats.

    The data is grouped by the event column. For each event the result holds the
    number of distinct cases containing it, the total number of rows with a
    non-null case key, and the difference between the two ("Iterations", i.e.
    occurrences beyond the first one per case).

    Args:
        df (DataFrame): The input dataframe.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.

    Returns:
        DataFrame: One row per event with the columns "Number of Distinct Cases",
        "Total Number Cases" and "Iterations", ordered by
        "Number of Distinct Cases" descending.

    Example:
        example_data = [("CASE1", "Event1"), ("CASE1", "Event2"), ("CASE2", "Event1"), ("CASE1", "Event1")]
        example_df = spark.createDataFrame(example_data, ["CASE_KEY", "Funktion"])
        result_df = event_count(example_df, "CASE_KEY", "Funktion")
        result_df.show()
    """
    distinct_cases = F.countDistinct(case_key).alias("Number of Distinct Cases")
    total_rows = F.count(case_key).alias("Total Number Cases")
    # Repeated occurrences of the event within the same case.
    repeats = F.col("Total Number Cases") - F.col("Number of Distinct Cases")
    return (
        df.groupBy(event)
        .agg(distinct_cases, total_rows)
        .withColumn("Iterations", repeats)
        .orderBy(F.col("Number of Distinct Cases").desc())
    )

In [None]:
def event_percentage(df: DataFrame, case_key: str, event: str) -> DataFrame:
    """
    Express each event's frequency relative to the total number of cases.

    Two percentages are computed per event, both using the number of distinct
    case keys in ``df`` as the denominator:

    * "Percentage of Distinct Cases": share of cases that contain the event.
    * "Percentage of Cases": event rows (with a non-null case key) per 100
      cases; this exceeds 100 when the event repeats within cases.

    Counting the distinct cases runs a Spark job immediately.

    Args:
        df (DataFrame): The input dataframe.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.

    Returns:
        DataFrame: One row per event with both percentage columns, ordered by
        "Percentage of Cases" descending.

    Example:
        example_data = [("CASE1", "Event1"), ("CASE1", "Event2"), ("CASE2", "Event1"), ("CASE1", "Event1")]
        example_df = spark.createDataFrame(example_data, ["CASE_KEY", "Funktion"])
        result_df = event_percentage(example_df, "CASE_KEY", "Funktion")
        result_df.show()
    """
    # Denominator shared by both percentages.
    n_cases = df.select(case_key).distinct().count()

    share_of_cases = F.countDistinct(case_key) / n_cases * 100
    rows_per_hundred_cases = F.count(case_key) / n_cases * 100

    per_event = df.groupBy(event).agg(
        share_of_cases.alias("Percentage of Distinct Cases"),
        rows_per_hundred_cases.alias("Percentage of Cases"),
    )
    return per_event.orderBy(F.desc("Percentage of Cases"))


# Display the per-event percentages for the loaded data.
event_percentage(df, "case_key", "Funktion").show()

In [None]:
def event_start_end_percentage(
    df: DataFrame, case_key: str, event: str, timestamp: str
) -> DataFrame:
    """
    Calculate, per event, the percentage of cases that start and that end with it.

    The start (end) event of a case is the event of its row with the smallest
    (largest) timestamp. If several rows of a case share that timestamp, which
    of them is picked is not defined. Counting the distinct cases runs a Spark
    job immediately.

    Args:
        df (DataFrame): The input dataframe.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.
        timestamp (str): The column name for the timestamp.

    Returns:
        DataFrame: The outer join of the start and end statistics with the
        columns "Start Event", "Percentage of Cases Starting", "End Event" and
        "Percentage of Cases Ending". An event that only ever starts (or only
        ever ends) cases has nulls in the other pair of columns. Rows are
        ordered by "Percentage of Cases Starting" and then
        "Percentage of Cases Ending", both descending with nulls last.

    Example:
        example_data = [("CASE1", "Event1", "2022-01-01"), ("CASE1", "Event2", "2022-01-02"), ("CASE2", "Event1", "2022-01-01"), ("CASE1", "Event1", "2022-01-03")]
        example_df = spark.createDataFrame(example_data, ["CASE_KEY", "Funktion", "Timestamp"])
        result_df = event_start_end_percentage(example_df, "CASE_KEY", "Funktion", "Timestamp")
        result_df.show()
    """
    # The frame must span the whole case. With only an ORDER BY the default
    # frame ends at the current row, so last() would return each row's own
    # event instead of the final event of the case.
    case_window = (
        Window.partitionBy(case_key)
        .orderBy(timestamp)
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )

    # Every row of a case now carries the same start/end pair, so distinct()
    # collapses it to one row per case without depending on row order (unlike
    # first()/last() used as grouped aggregates, which are non-deterministic).
    df_start_end_events = df.select(
        F.col(case_key),
        F.first(event).over(case_window).alias("Start Event"),
        F.last(event).over(case_window).alias("End Event"),
    ).distinct()

    total_cases = df.select(case_key).distinct().count()

    df_start_event_percentage = df_start_end_events.groupBy("Start Event").agg(
        (F.countDistinct(case_key) / total_cases * 100).alias(
            "Percentage of Cases Starting"
        )
    )

    df_end_event_percentage = df_start_end_events.groupBy("End Event").agg(
        (F.countDistinct(case_key) / total_cases * 100).alias(
            "Percentage of Cases Ending"
        )
    )

    df_start_end_percentage = df_start_event_percentage.join(
        df_end_event_percentage,
        df_start_event_percentage["Start Event"]
        == df_end_event_percentage["End Event"],
        "outer",
    )

    # Sort after the join: a join does not preserve the order of its inputs,
    # so sorting the two sides beforehand would not order the result.
    return df_start_end_percentage.orderBy(
        F.desc_nulls_last("Percentage of Cases Starting"),
        F.desc_nulls_last("Percentage of Cases Ending"),
    )