# Data Exploration


In [1]:
# import pyspark and start a session
from datetime import datetime
from pyspark.sql import SparkSession

from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    TimestampType,
    DoubleType,
    IntegerType,
    BooleanType,
)

import pyspark.sql.functions as F

# import the window function module
from pyspark.sql.window import Window

# import row_number function
from pyspark.sql.functions import row_number

spark = SparkSession.builder.appName("spark").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/28 18:16:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Define a function to convert string to timestamp
def str_to_timestamp(str_date):
    return datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S")


# Define schema for the DataFrame
schema = StructType(
    [
        StructField("VorgangsID", StringType(), True),
        StructField("Datum", TimestampType(), True),
        StructField("Funktion", StringType(), True),
        StructField("Tarifname", StringType(), True),
        StructField("Sparte", StringType(), True),
        StructField("Betrag", DoubleType(), True),
        StructField("Beitragssumme", DoubleType(), True),
        StructField("Zahlweise", StringType(), True),
        StructField("Waehrung", StringType(), True),
        StructField("Bewertungsstuffe", IntegerType(), True),
        StructField("Vorschlagsnummer", StringType(), True),
        StructField("Benutzername", StringType(), True),
        StructField("Benutzergruppe", StringType(), True),
        StructField("SichtenID", StringType(), True),
        StructField("IDzurAggtNr", StringType(), True),
        StructField("KZ_Risikoprüfung", BooleanType(), True),
        StructField("AIS_CaseID", StringType(), True),
        StructField("EsignFaehig", BooleanType(), True),
        StructField("Antragsnummer", StringType(), True),
        StructField("Vertriebsstelle", StringType(), True),
    ]
)

# Sample data
data_1 = [
    (
        "ID1",
        str_to_timestamp("2024-04-27 08:30:00"),
        "Berechnung",
        "Tariff1",
        "BU",
        1000.0,
        500.0,
        "Jährlich",
        "EUR",
        3,
        "Vorschlag1",
        "User1",
        "Berater",
        "View1",
        "Agent1",
        True,
        "Case1",
        True,
        "App1",
        "SalesPoint1",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 10:45:00"),
        "Antrag (Papier)",
        "Tariff2",
        "EU",
        1500.0,
        750.0,
        "Monatlich",
        "EUR",
        4,
        "Vorschlag2",
        "User2",
        "Kunde",
        "View2",
        "Agent2",
        False,
        "Case2",
        False,
        "App2",
        "SalesPoint2",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 12:00:00"),
        "Laden",
        "Tariff3",
        "BU",
        2000.0,
        1000.0,
        "Jährlich",
        "EUR",
        5,
        "Vorschlag3",
        "User3",
        "Berater",
        "View3",
        "Agent3",
        True,
        "Case3",
        True,
        "App3",
        "SalesPoint3",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 13:15:00"),
        "Sync",
        "Tariff4",
        "EU",
        2500.0,
        1250.0,
        "Monatlich",
        "EUR",
        6,
        "Vorschlag4",
        "User4",
        "Kunde",
        "View4",
        "Agent4",
        False,
        "Case4",
        False,
        "App4",
        "SalesPoint4",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 14:30:00"),
        "Antrag Start",
        "Tariff5",
        "BU",
        3000.0,
        1500.0,
        "Jährlich",
        "EUR",
        7,
        "Vorschlag5",
        "User5",
        "Berater",
        "View5",
        "Agent5",
        True,
        "Case5",
        True,
        "App5",
        "SalesPoint5",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 15:45:00"),
        "Antrag (E-Sign)",
        "Tariff6",
        "EU",
        3500.0,
        1750.0,
        "Monatlich",
        "EUR",
        8,
        "Vorschlag6",
        "User6",
        "Kunde",
        "View6",
        "Agent6",
        False,
        "Case6",
        False,
        "App6",
        "SalesPoint6",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 17:00:00"),
        "Geloescht",
        "Tariff7",
        "BU",
        4000.0,
        2000.0,
        "Jährlich",
        "EUR",
        9,
        "Vorschlag7",
        "User7",
        "Berater",
        "View7",
        "Agent7",
        True,
        "Case7",
        True,
        "App7",
        "SalesPoint7",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 18:15:00"),
        "Antrag (Papier)",
        "Tariff8",
        "EU",
        4500.0,
        2250.0,
        "Monatlich",
        "EUR",
        10,
        "Vorschlag8",
        "User8",
        "Kunde",
        "View8",
        "Agent8",
        False,
        "Case8",
        False,
        "App8",
        "SalesPoint8",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 19:30:00"),
        "Laden",
        "Tariff9",
        "BU",
        5000.0,
        2500.0,
        "Jährlich",
        "EUR",
        11,
        "Vorschlag9",
        "User9",
        "Berater",
        "View9",
        "Agent9",
        True,
        "Case9",
        True,
        "App9",
        "SalesPoint9",
    ),
    (
        "ID2",
        str_to_timestamp("2024-04-27 20:45:00"),
        "Sync",
        "Tariff10",
        "EU",
        5500.0,
        2750.0,
        "Monatlich",
        "EUR",
        12,
        "Vorschlag10",
        "User10",
        "Kunde",
        "View10",
        "Agent10",
        False,
        "Case10",
        False,
        "App10",
        "SalesPoint10",
    ),
]

data_2 = [
    (
        "ID1",
        str_to_timestamp("2024-04-27 08:30:00"),
        "Berechnung",
        "Tariff1",
        "BU",
        1000.0,
        500.0,
        "Jährlich",
        "EUR",
        3,
        "Vorschlag1",
        "User1",
        "Berater",
        "View1",
        "Agent1",
        True,
        "Case1",
        True,
        "App1",
        "SalesPoint1",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 10:45:00"),
        "Antrag (Papier)",
        "Tariff2",
        "EU",
        1500.0,
        750.0,
        "Monatlich",
        "EUR",
        4,
        "Vorschlag2",
        "User2",
        "Kunde",
        "View2",
        "Agent2",
        False,
        "Case2",
        False,
        "App2",
        "SalesPoint2",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 12:00:00"),
        "Laden",
        "Tariff3",
        "BU",
        2000.0,
        1000.0,
        "Jährlich",
        "EUR",
        5,
        "Vorschlag3",
        "User3",
        "Berater",
        "View3",
        "Agent3",
        True,
        "Case3",
        True,
        "App3",
        "SalesPoint3",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 13:15:00"),
        "Sync",
        "Tariff4",
        "EU",
        2500.0,
        1250.0,
        "Monatlich",
        "EUR",
        6,
        "Vorschlag4",
        "User4",
        "Kunde",
        "View4",
        "Agent4",
        False,
        "Case4",
        False,
        "App4",
        "SalesPoint4",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 14:30:00"),
        "Antrag Start",
        "Tariff5",
        "BU",
        3000.0,
        1500.0,
        "Jährlich",
        "EUR",
        7,
        "Vorschlag5",
        "User5",
        "Berater",
        "View5",
        "Agent5",
        True,
        "Case5",
        True,
        "App5",
        "SalesPoint5",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 15:45:00"),
        "Antrag (E-Sign)",
        "Tariff6",
        "EU",
        3500.0,
        1750.0,
        "Monatlich",
        "EUR",
        8,
        "Vorschlag6",
        "User6",
        "Kunde",
        "View6",
        "Agent6",
        False,
        "Case6",
        False,
        "App6",
        "SalesPoint6",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 17:00:00"),
        "Geloescht",
        "Tariff7",
        "BU",
        4000.0,
        2000.0,
        "Jährlich",
        "EUR",
        9,
        "Vorschlag7",
        "User7",
        "Berater",
        "View7",
        "Agent7",
        True,
        "Case7",
        True,
        "App7",
        "SalesPoint7",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 18:15:00"),
        "Antrag (Papier)",
        "Tariff8",
        "EU",
        4500.0,
        2250.0,
        "Monatlich",
        "EUR",
        10,
        "Vorschlag8",
        "User8",
        "Kunde",
        "View8",
        "Agent8",
        False,
        "Case8",
        False,
        "App8",
        "SalesPoint8",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 19:30:00"),
        "Laden",
        "Tariff9",
        "BU",
        5000.0,
        2500.0,
        "Jährlich",
        "EUR",
        11,
        "Vorschlag9",
        "User9",
        "Berater",
        "View9",
        "Agent9",
        True,
        "Case9",
        True,
        "App9",
        "SalesPoint9",
    ),
    (
        "ID2",
        str_to_timestamp("2024-04-27 20:45:00"),
        "Sync",
        "Tariff10",
        "EU",
        5500.0,
        2750.0,
        "Monatlich",
        "EUR",
        12,
        "Vorschlag10",
        "User10",
        "Kunde",
        "View10",
        "Agent10",
        False,
        "Case10",
        False,
        "App10",
        "SalesPoint10",
    ),
]
# Create DataFrame
df_1 = spark.createDataFrame(data_1, schema)

df_2 = spark.createDataFrame(data_2, schema)

**Data Description:**

**1. VorgangsID:**

- This column comprises identifiers for distinct processes, where each process may consist of multiple rows within the dataset.

**2. Datum:**

- Indicates the date and time for each event (Funktion) associated with a particular process (VorgangsID). The date is formatted as `YYYY-MM-DD HH:MM`.

**3. Funktion:**

- Represents various events within each process (VorgangsID), with unique values including:
  - `Antrag (E-Sign)`
  - `Antrag (Papier)`
  - `Antrag (VE)`
  - `Remote E-Sign`
  - `Antrag Eingereicht`
  - `Antrag Start`
  - `Antrag Vollstaendig`
  - `Berechnung`
  - `Erstellt`
  - `Geloescht`
  - `Laden`
  - `Sync`
  - `Vorschlag`

**4. Tarifname:**

- Contains the name of the tariff associated with each process.

**5. Sparte:**

- Indicates the category of insurance, with values such as:
  - `BU`
  - `BUZ`
  - `EU`
  - `EUZ`
  - `KLV`
  - `KLVZ`
  - `PKV`
  - `PKVZ`
  - `RIS`
  - `RISZ`
  - `RUV`
  - `RUVZ`
  - `SBU`

**6. Betrag:**

- Denotes the amount associated with each process.

**7. Beitragssumme:**

- Represents the sum of contributions related to each process.

**8. Zahlweise:**

- Indicates the payment frequency, with values including:
  - `Einmal`
  - `Jährlich`
  - `Monatlich`
  - `Quartal`
  - `Sofort`
  - `Zweijährlich`

**9. Waehrung:**

- Almost exclusively contains the value `EUR` denoting currency.

**10. Bewertungsstuffe:** - Contains numeric values ranging from 0 to 5.

**11. Vorschlagsnummer:** - Contains identifiers for proposals associated with processes.

**12. Benutzername:** - Contains the name of the user associated with each process.

**13. Benutzergruppe:** - Represents the user group, with possible values including: - `Berater` - `Kunde` - `Kundenberater` - `Kundenbetreuer` - `Kundenbetreuerin`

**14. SichtenID:** - Contains identifiers for views related to processes.

**15. IDzurAggtNr:** - Contains identifiers for agents associated with processes.

**16. KZ_Risikoprüfung:** - Represents a boolean value indicating risk assessment.

**17. AIS_CaseID:** - Contains identifiers for health checks within life insurance processes.

**18. EsignFaehig:** - Represents a boolean value indicating electronic signature capability.

**19. Antragsnummer:** - Contains identifiers for applications associated with processes.

**20. Vertriebsstelle:** - Contains identifiers for sales points associated with processes.


In [7]:
from datetime import datetime


# Define a function to convert string to timestamp
def str_to_timestamp(str_date):
    return datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S")


# Define schema for the DataFrame
schema = StructType(
    [
        StructField("VorgangsID", StringType(), True),
        StructField("Datum", TimestampType(), True),
        StructField("Funktion", StringType(), True),
        StructField("Tarifname", StringType(), True),
        StructField("Sparte", StringType(), True),
        StructField("Betrag", DoubleType(), True),
        StructField("Beitragssumme", DoubleType(), True),
        StructField("Zahlweise", StringType(), True),
        StructField("Waehrung", StringType(), True),
        StructField("Bewertungsstuffe", IntegerType(), True),
        StructField("Vorschlagsnummer", StringType(), True),
        StructField("Benutzername", StringType(), True),
        StructField("Benutzergruppe", StringType(), True),
        StructField("SichtenID", StringType(), True),
        StructField("IDzurAggtNr", StringType(), True),
        StructField("KZ_Risikoprüfung", BooleanType(), True),
        StructField("AIS_CaseID", StringType(), True),
        StructField("EsignFaehig", BooleanType(), True),
        StructField("Antragsnummer", StringType(), True),
        StructField("Vertriebsstelle", StringType(), True),
    ]
)

# Sample data
data = [
    (
        "ID1",
        str_to_timestamp("2024-04-27 08:30:00"),
        "Antrag Start",
        "Tariff1",
        "BU",
        1000.0,
        500.0,
        "Jährlich",
        "EUR",
        3,
        "Vorschlag1",
        "User1",
        "Berater",
        "View1",
        "Agent1",
        True,
        "Case1",
        True,
        "App1",
        "SalesPoint1",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 08:31:00"),
        "Berechnung",
        "Tariff2",
        "EU",
        1500.0,
        750.0,
        "Monatlich",
        "EUR",
        4,
        "Vorschlag2",
        "User2",
        "Kunde",
        "View2",
        "Agent2",
        False,
        "Case2",
        False,
        "App2",
        "SalesPoint2",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 12:00:00"),
        "Laden",
        "Tariff3",
        "BU",
        2000.0,
        1000.0,
        "Jährlich",
        "EUR",
        5,
        "Vorschlag3",
        "User3",
        "Berater",
        "View3",
        "Agent3",
        True,
        "Case3",
        True,
        "App3",
        "SalesPoint3",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 13:15:00"),
        "Sync",
        "Tariff4",
        "EU",
        2500.0,
        1250.0,
        "Monatlich",
        "EUR",
        6,
        "Vorschlag4",
        "User4",
        "Kunde",
        "View4",
        "Agent4",
        False,
        "Case4",
        False,
        "App4",
        "SalesPoint4",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 14:30:00"),
        "Antrag Start",
        "Tariff5",
        "BU",
        3000.0,
        1500.0,
        "Jährlich",
        "EUR",
        7,
        "Vorschlag5",
        "User5",
        "Berater",
        "View5",
        "Agent5",
        True,
        "Case5",
        True,
        "App5",
        "SalesPoint5",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 15:45:00"),
        "Antrag (E-Sign)",
        "Tariff6",
        "EU",
        3500.0,
        1750.0,
        "Monatlich",
        "EUR",
        8,
        "Vorschlag6",
        "User6",
        "Kunde",
        "View6",
        "Agent6",
        False,
        "Case6",
        False,
        "App6",
        "SalesPoint6",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 17:00:00"),
        "Geloescht",
        "Tariff7",
        "BU",
        4000.0,
        2000.0,
        "Jährlich",
        "EUR",
        9,
        "Vorschlag7",
        "User7",
        "Berater",
        "View7",
        "Agent7",
        True,
        "Case7",
        True,
        "App7",
        "SalesPoint7",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 18:15:00"),
        "Antrag (Papier)",
        "Tariff8",
        "EU",
        4500.0,
        2250.0,
        "Monatlich",
        "EUR",
        10,
        "Vorschlag8",
        "User8",
        "Kunde",
        "View8",
        "Agent8",
        False,
        "Case8",
        False,
        "App8",
        "SalesPoint8",
    ),
    (
        "ID1",
        str_to_timestamp("2024-04-27 19:30:00"),
        "Laden",
        "Tariff9",
        "BU",
        5000.0,
        2500.0,
        "Jährlich",
        "EUR",
        11,
        "Vorschlag9",
        "User9",
        "Berater",
        "View9",
        "Agent9",
        True,
        "Case9",
        True,
        "App9",
        "SalesPoint9",
    ),
    (
        "ID2",
        str_to_timestamp("2024-04-27 20:45:00"),
        "Sync",
        "Tariff10",
        "EU",
        5500.0,
        2750.0,
        "Monatlich",
        "EUR",
        12,
        "Vorschlag10",
        "User10",
        "Kunde",
        "View10",
        "Agent10",
        False,
        "Case10",
        False,
        "App10",
        "SalesPoint10",
    ),
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Show DataFrame
df.show()

+----------+-------------------+---------------+---------+------+------+-------------+---------+--------+----------------+----------------+------------+--------------+---------+-----------+----------------+----------+-----------+-------------+---------------+
|VorgangsID|              Datum|       Funktion|Tarifname|Sparte|Betrag|Beitragssumme|Zahlweise|Waehrung|Bewertungsstuffe|Vorschlagsnummer|Benutzername|Benutzergruppe|SichtenID|IDzurAggtNr|KZ_Risikoprüfung|AIS_CaseID|EsignFaehig|Antragsnummer|Vertriebsstelle|
+----------+-------------------+---------------+---------+------+------+-------------+---------+--------+----------------+----------------+------------+--------------+---------+-----------+----------------+----------+-----------+-------------+---------------+
|       ID1|2024-04-27 08:30:00|   Antrag Start|  Tariff1|    BU|1000.0|        500.0| Jährlich|     EUR|               3|      Vorschlag1|       User1|       Berater|    View1|     Agent1|            true|     Case1|   

# Event Analysis


In [30]:
# How often does each Event (Funktion) occur for each VorgangsID on average

df_grouped = df.groupBy("VorgangsID", "Funktion").count()

df_renamed = df_grouped.withColumnRenamed("count", "event_count")

df_avg = df_renamed.groupBy("Funktion").agg(
    F.avg("event_count").alias("avg_event_count")
)

df_ordered = df_avg.orderBy(F.desc("avg_event_count"))

df_ordered.show()

+---------------+---------------+
|       Funktion|avg_event_count|
+---------------+---------------+
|   Antrag Start|            2.0|
|          Laden|            2.0|
|      Geloescht|            1.0|
|Antrag (Papier)|            1.0|
|Antrag (E-Sign)|            1.0|
|           Sync|            1.0|
|     Berechnung|            1.0|
+---------------+---------------+



In [32]:
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql import functions as F


def calculate_throughput_time(df: DataFrame, event: str) -> DataFrame:
    # Define window specification
    window_spec = Window.partitionBy("VorgangsID").orderBy("Datum")

    # Filter DataFrame based on event
    df_filtered = df.filter(F.col("Funktion") == event)

    # Add min_date and max_date columns
    df_filtered = (
        df_filtered.withColumn("min_date", F.min("Datum").over(window_spec))
        .withColumn("max_date", F.max("Datum").over(window_spec))
        .select("VorgangsID", "Datum", "Funktion", "min_date", "max_date")
    )

    # Calculate time difference in seconds
    df_filtered = df_filtered.withColumn(
        "time_diff", F.unix_timestamp("max_date") - F.unix_timestamp("min_date")
    )

    # Convert time difference to minutes, hours, and days
    df_filtered = df_filtered.withColumn("time_diff_minutes", F.col("time_diff") / 60)
    df_filtered = df_filtered.withColumn("time_diff_hours", F.col("time_diff") / 3600)
    df_filtered = df_filtered.withColumn(
        "time_diff_days", F.col("time_diff") / (3600 * 24)
    )

    return df_filtered


# Get distinct values from the "Funktion" column
distinct_events = df.select("Funktion").distinct().rdd.flatMap(lambda x: x).collect()

# Create a dropdown widget with the distinct values
dbutils.widgets.dropdown("event", distinct_events[0], distinct_events)

# Get the selected event from the widget
selected_event = dbutils.widgets.get("event")

# Use the selected event in the function
df_filtered = calculate_throughput_time(df, selected_event)

# Show the average time difference
df_filtered.groupBy().avg("time_diff").show()

NameError: name 'dbutils' is not defined

In [27]:
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql import functions as F


def calculate_time_diff(df: DataFrame, event1: str, event2: str) -> DataFrame:
    # Define window specification
    window_spec = Window.partitionBy("VorgangsID").orderBy("Datum")

    # Filter DataFrame based on event1 and event2
    df_filtered = df.filter(F.col("Funktion").isin([event1, event2]))

    # Calculate the time difference between the first occurrence of event1 and the first occurrence of event2
    df_filtered = (
        df_filtered.withColumn(
            "min_date_event1",
            F.min(F.when(F.col("Funktion") == event1, F.col("Datum"))).over(
                window_spec
            ),
        )
        .withColumn(
            "min_date_event2",
            F.min(F.when(F.col("Funktion") == event2, F.col("Datum"))).over(
                window_spec
            ),
        )
        .select("VorgangsID", "Datum", "Funktion", "min_date_event1", "min_date_event2")
    )

    df_filtered = df_filtered.withColumn(
        "time_diff",
        F.unix_timestamp("min_date_event2") - F.unix_timestamp("min_date_event1"),
    )

    # Convert time difference to minutes, hours, and days
    df_filtered = df_filtered.withColumn("time_diff_minutes", F.col("time_diff") / 60)
    df_filtered = df_filtered.withColumn("time_diff_hours", F.col("time_diff") / 3600)
    df_filtered = df_filtered.withColumn(
        "time_diff_days", F.col("time_diff") / (3600 * 24)
    )

    return df_filtered


# Get distinct values from the "Funktion" column
distinct_events = df.select("Funktion").distinct().rdd.flatMap(lambda x: x).collect()

# Create dropdown widgets for event1 and event2
dbutils.widgets.dropdown("event1", distinct_events[0], distinct_events)
dbutils.widgets.dropdown("event2", distinct_events[0], distinct_events)

# Get the selected events from the widgets
selected_event1 = dbutils.widgets.get("event1")
selected_event2 = dbutils.widgets.get("event2")

# Use the selected events in the function
df_filtered = calculate_time_diff(df, selected_event1, selected_event2)

# Show the average time difference
df_filtered.groupBy().avg("time_diff").show()

+--------------+
|avg(time_diff)|
+--------------+
|       -9000.0|
+--------------+



In [33]:
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql import functions as F


def calculate_avg_event_count(df: DataFrame) -> DataFrame:
    # Define window specification
    window_spec = Window.partitionBy("VorgangsID").orderBy("Datum")

    # Add a new column to the DataFrame that contains the previous event
    df = df.withColumn("previous_event", F.lag("Funktion").over(window_spec))

    # Add a new column to the DataFrame that contains the next event
    df = df.withColumn("next_event", F.lead("Funktion").over(window_spec))

    # Filter the DataFrame to include only rows where the previous event is the same as the current event
    # or the next event is the same as the current event
    df_filtered = df.filter(
        (F.col("Funktion") == F.col("previous_event"))
        | (F.col("Funktion") == F.col("next_event"))
    )

    # Calculate the average count of consecutive or simultaneous occurrences of the same event for each VorgangsID
    df_avg_event_count = df_filtered.groupBy("VorgangsID", "Funktion").count().withColumnRenamed(
        "count", "event_count"
    ).groupBy("Funktion").agg(
        F.avg("event_count").alias("avg_loop_count_per_case")
    ).orderBy(F.desc("avg_loop_count_per_case"))

    return df_avg_event_count


# Usage
df_avg_event_count = calculate_avg_event_count(df)
df_avg_event_count.show()

+--------+-----------------------+
|Funktion|avg_loop_count_per_case|
+--------+-----------------------+
+--------+-----------------------+



In [34]:
# close the session
spark.stop()