# Event Analysis


In [None]:
from pyspark.sql import SparkSession

# Start the Spark session (getOrCreate reuses an already-running session if
# there is one) that the cells below use for reading and transforming data.
spark = SparkSession.builder.getOrCreate()

## Import Libraries


In [None]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

## Read data


In [None]:
# Load the raw event log from the semicolon-separated file. With
# inferSchema=False every column is read as a string.
df = spark.read.csv("Raw/data.csv", header=True, inferSchema=False, sep=";")
# Parse the timestamp with the 24-hour field "H". The previous pattern used
# "h" (clock-hour-of-am-pm, 1-12) without an AM/PM marker, which cannot
# represent afternoon times and reads 12:xx as 00:xx, so events around noon
# were ordered incorrectly within a case.
# NOTE(review): assumes the file stores 24-hour times - confirm against the data.
df = df.withColumn("Datum", F.to_timestamp("Datum", "d.M.yy H:mm"))
# Drop rows with a missing case_key
df = df.filter(F.col("case_key").isNotNull())

## Analysis


In [None]:
def event_sequence_overview(
    df: DataFrame, case_key: str, event: str, timestamp: str
) -> DataFrame:
    """
    Rank the "event -> next event" transitions by the number of cases containing them.

    Within each case the rows are ordered by ``timestamp`` and every event is
    paired with the event that follows it. Each pair is rendered as
    ``"<event> -> <next event>"``.

    Args:
        df (DataFrame): The event log, one row per event.
        case_key (str): Name of the column identifying the case.
        event (str): Name of the column holding the event.
        timestamp (str): Name of the column used to order events within a case.

    Returns:
        DataFrame: Columns ``event_sequence``, ``count`` (distinct cases that
        contain the transition) and ``percentage`` (``count`` relative to the
        number of distinct ``case_key`` values in ``df``), sorted by ``count``
        in descending order.

    Raises:
        ValueError: If ``case_key``, ``event`` or ``timestamp`` is not a column of ``df``.

    Example:
        >>> rows = [("A1", "2022-01-23 08:00", "Antrag Start"),
        ...         ("A1", "2022-01-23 08:10", "Vorschlag"),
        ...         ("A2", "2022-01-23 10:00", "Antrag Start"),
        ...         ("A2", "2022-01-23 10:20", "Sync")]
        >>> log = spark.createDataFrame(rows, schema=["CASE_KEY", "Datum", "Funktion"])
        >>> event_sequence_overview(log, "CASE_KEY", "Funktion", "Datum").show(truncate=False)
    """
    # Fail early, naming the first requested column that is absent
    available = df.columns
    for name in (case_key, event, timestamp):
        if name in available:
            continue
        raise ValueError(f"The column '{name}' does not exist in the DataFrame")

    # Attach to every row the event that follows it inside the same case
    per_case_in_time_order = Window.partitionBy(case_key).orderBy(timestamp)
    with_successor = df.withColumn(
        "next_event", F.lead(event).over(per_case_in_time_order)
    )

    # The last event of a case has no successor and therefore forms no transition
    transitions = with_successor.where(F.col("next_event").isNotNull()).withColumn(
        "event_sequence",
        F.concat(F.col(event), F.lit(" -> "), F.col("next_event")),
    )

    # Denominator for the percentage: number of distinct case keys in the input
    n_cases = with_successor.select(case_key).distinct().count()

    # Cases per transition, most frequent first, plus the share of all cases
    return (
        transitions.groupBy("event_sequence")
        .agg(F.countDistinct(case_key).alias("count"))
        .orderBy(F.desc("count"))
        .withColumn("percentage", (F.col("count") / n_cases) * 100)
    )

In [None]:
def find_case_keys_for_event_sequence(
    df: DataFrame, case_key: str, event: str, timestamp: str, sequence: str
) -> DataFrame:
    """
    List the cases in which a given "event -> next event" transition occurs.

    Within each case the rows are ordered by ``timestamp`` and every event is
    paired with the event that follows it. Each pair is rendered as
    ``"<event> -> <next event>"`` and compared with ``sequence``.

    Args:
        df (DataFrame): The event log, one row per event.
        case_key (str): Name of the column identifying the case.
        event (str): Name of the column holding the event.
        timestamp (str): Name of the column used to order events within a case.
        sequence (str): The transition to look for, e.g. ``"Antrag Start -> Sync"``.

    Returns:
        DataFrame: A single-column DataFrame with the distinct ``case_key``
        values whose events contain the transition.

    Raises:
        ValueError: If ``case_key``, ``event`` or ``timestamp`` is not a column of ``df``.

    Example:
        >>> rows = [("A1", "2022-01-23 08:00", "Antrag Start"),
        ...         ("A1", "2022-01-23 08:10", "Vorschlag"),
        ...         ("A2", "2022-01-23 10:00", "Antrag Start"),
        ...         ("A2", "2022-01-23 10:20", "Sync")]
        >>> log = spark.createDataFrame(rows, schema=["CASE_KEY", "Datum", "Funktion"])
        >>> find_case_keys_for_event_sequence(
        ...     log, "CASE_KEY", "Funktion", "Datum", "Antrag Start -> Sync"
        ... ).show(truncate=False)
    """
    # Fail early, naming the first requested column that is absent
    available = df.columns
    for name in (case_key, event, timestamp):
        if name in available:
            continue
        raise ValueError(
            f"The column '{name}' does not exist in the DataFrame")

    # Attach to every row the event that follows it inside the same case
    per_case_in_time_order = Window.partitionBy(case_key).orderBy(timestamp)
    successor = F.lead(event).over(per_case_in_time_order)
    with_successor = df.withColumn("next_event", successor)

    # The last event of a case has no successor and therefore forms no transition
    has_successor = F.col("next_event").isNotNull()
    label = F.concat(F.col(event), F.lit(" -> "), F.col("next_event"))
    transitions = with_successor.where(has_successor).withColumn(
        "event_sequence", label
    )

    # Keep only the requested transition and report each matching case once
    matching = transitions.where(F.col("event_sequence") == sequence)
    return matching.select(case_key).distinct()

In [None]:
def event_sequence_total(
    df: DataFrame, case_key: str, event: str, timestamp: str
) -> DataFrame:
    """
    Count how many cases share each full sequence of events, first event to last.

    Within each case the rows are ordered by ``timestamp`` and every event is
    paired with the event that follows it (``"<event> -> <next event>"``). The
    pairs of a case are joined with ``", "`` in that same order to form the
    full sequence of the case.

    Args:
        df (DataFrame): The input dataframe, one row per event.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.
        timestamp (str): The column name for the timestamp.

    Returns:
        DataFrame: Columns ``event_sequence`` (the full sequence), ``count``
        (number of cases with that sequence) and ``percentage`` (``count``
        relative to the sum of all counts), sorted by ``count`` descending.
        Cases without any transition (a single event) are not included.

    Example:
        result_df = event_sequence_total(df, "CASE_KEY", "Funktion", "Timestamp")
    """
    window = Window.partitionBy(case_key).orderBy(timestamp)

    # Besides the successor event, remember each row's position inside its
    # case. Both are computed over the same window, so the position reflects
    # exactly the order that lead() used.
    df = df.withColumn("next_event", F.lead(event).over(window)).withColumn(
        "_event_position", F.row_number().over(window)
    )

    # The last event of a case has no successor and forms no transition
    df_event_sequence = df.filter(F.col("next_event").isNotNull())

    df_event_sequence = df_event_sequence.withColumn(
        "event_sequence", F.concat(F.col(event), F.lit(" -> "), F.col("next_event"))
    )

    # collect_list gives no ordering guarantee after a shuffle, so identical
    # traces could be joined in different orders and counted as different
    # sequences. Collect (position, transition) pairs and sort them instead;
    # structs sort by their first field, i.e. by position.
    df_event_sequence = df_event_sequence.groupBy(case_key).agg(
        F.sort_array(
            F.collect_list(F.struct("_event_position", "event_sequence"))
        ).alias("_ordered_steps")
    )

    # Extract the transitions from the sorted pairs and join them into one string
    df_event_sequence = df_event_sequence.withColumn(
        "event_sequence",
        F.concat_ws(", ", F.col("_ordered_steps.event_sequence")),
    )

    df_most_frequent_sequence_all_events = (
        df_event_sequence.groupBy("event_sequence").count().orderBy(F.desc("count"))
    )

    # Denominator: number of cases that contributed a sequence
    total_count = df_most_frequent_sequence_all_events.select(F.sum("count")).first()[0]

    df_most_frequent_sequence_all_events = (
        df_most_frequent_sequence_all_events.withColumn(
            "percentage", (F.col("count") / total_count) * 100
        )
    )

    return df_most_frequent_sequence_all_events


# Print every full per-case event sequence with its case count and percentage
event_sequence_total(df, "case_key", "Funktion", "Datum").show(truncate=False)

In [None]:
def find_case_keys_for_event_sequence_total(
    df: DataFrame, case_key: str, event: str, timestamp: str, sequence: str
) -> DataFrame:
    """
    Find the case keys whose full sequence of events equals ``sequence``.

    Within each case the rows are ordered by ``timestamp`` and every event is
    paired with the event that follows it (``"<event> -> <next event>"``). The
    pairs of a case are joined with ``", "`` in that same order and the result
    is compared with ``sequence``.

    Args:
        df (DataFrame): The input dataframe, one row per event.
        case_key (str): The column name for the case key.
        event (str): The column name for the event.
        timestamp (str): The column name for the timestamp.
        sequence (str): The full sequence of events, e.g.
            ``"event1 -> event2, event2 -> event3"``.

    Returns:
        DataFrame: A single-column dataframe with the case keys for the sequence.

    Example:
        result_df = find_case_keys_for_event_sequence_total(
            df, "CASE_KEY", "Funktion", "Timestamp", "event1 -> event2, event2 -> event3"
        )
    """
    window = Window.partitionBy(case_key).orderBy(timestamp)

    # Besides the successor event, remember each row's position inside its
    # case. Both are computed over the same window, so the position reflects
    # exactly the order that lead() used.
    df = df.withColumn("next_event", F.lead(event).over(window)).withColumn(
        "_event_position", F.row_number().over(window)
    )

    # The last event of a case has no successor and forms no transition
    df_event_sequence = df.filter(F.col("next_event").isNotNull())

    df_event_sequence = df_event_sequence.withColumn(
        "event_sequence", F.concat(F.col(event), F.lit(" -> "), F.col("next_event"))
    )

    # collect_list gives no ordering guarantee after a shuffle, so a case
    # could be joined in the wrong order and miss the equality filter below.
    # Collect (position, transition) pairs and sort them instead; structs
    # sort by their first field, i.e. by position.
    df_event_sequence = df_event_sequence.groupBy(case_key).agg(
        F.sort_array(
            F.collect_list(F.struct("_event_position", "event_sequence"))
        ).alias("_ordered_steps")
    )

    # Extract the transitions from the sorted pairs and join them into one string
    df_event_sequence = df_event_sequence.withColumn(
        "event_sequence",
        F.concat_ws(", ", F.col("_ordered_steps.event_sequence")),
    )

    # One row per case after the groupBy, so no distinct() is needed
    df_case_keys_for_sequence = df_event_sequence.filter(
        F.col("event_sequence") == sequence
    ).select(case_key)

    return df_case_keys_for_sequence


# Full trace to look up, written as comma-separated "event -> next event" pairs
sequence = "Antrag Start -> Sync, Sync -> Laden, Laden -> Antrag (VE), Antrag (VE) -> Geloescht"
# Use the same "case_key" spelling as the data-loading and overview cells so
# the lookup does not depend on case-insensitive column resolution.
find_case_keys_for_event_sequence_total(
    df, "case_key", "Funktion", "Datum", sequence
).show(truncate=False)