In [38]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder.getOrCreate()

## Import Libraries


In [2]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import DataFrame
from typing import Union, List, Dict, Tuple, Optional
from pyspark.sql.column import Column
from pyspark.sql.functions import broadcast

In [40]:
# Create a sample DataFrame
data = [
    ("A1", "2022-01-23 08:00", "Antrag Start", "Start"),
    ("A1", "2022-01-23 08:10", "Fristablauf", "BU"),
    ("A1", "2022-01-23 08:15", "Vorschlag", None),
    ("A2", "2022-01-24 10:00", "Antrag Start", None),
    ("A2", "2022-01-23 10:20", "Sync", "Sync Tarif"),
]

columns = ["CASE_KEY", "Datum", "Funktion", "Tarifname"]
df = spark.createDataFrame(data, columns)

# Distinct values in a dataframe


In [41]:
def distinct_values_with_first_appearance(
    df: DataFrame, first_date_col: str, columns_to_show: Union[str, List[str]]
) -> Dict[str, DataFrame]:
    """
    Compute distinct values for each column in the DataFrame along with the first appearance date of each value.

    Args:
        df (DataFrame): The input DataFrame.
        first_date_col (str): The name of the column representing the date of first appearance.
        columns_to_show (Union[str, List[str]]): Columns to display. It can be 'all', a list of column names, or a single column name.

    Returns:
        Dict[str, DataFrame]: A dictionary with column names as keys and DataFrames as values containing distinct values and their first appearance dates.

    Raises:
        ValueError: If `columns_to_show` is neither 'all', a column name, nor a list of column names.

    Examples:
        >>> df = spark.createDataFrame([
        ...     ("A1", "2022-01-23 08:00", "Antrag Start"),
        ...     ("A1", "2022-01-23 08:10", "Fristablauf"),
        ...     ("A2", "2022-01-23 10:00", "Antrag Start"),
        ...     ("A2", "2022-01-23 10:20", "Sync")
        ... ], ["CASE_KEY", "Datum", "Funktion"])
        >>> result = distinct_values_with_first_appearance(df, "Datum", "all")
        >>> result["CASE_KEY"].show()
        >>> result["Funktion"].show()
    """

    if not isinstance(columns_to_show, (str, list)):
        raise ValueError(
            "`columns_to_show` must be 'all', a column name, or a list of column names."
        )

    columns = [col for col in df.columns if col != first_date_col]

    result = {}
    for column in columns:
        distinct_df = df.select(column).distinct()
        first_date_df = df.groupBy(column).agg(
            F.min(first_date_col).alias(f"{column}_first_date")
        )
        distinct_df = distinct_df.join(first_date_df, on=column, how="left")
        result[column] = distinct_df

    if columns_to_show == "all":
        columns_to_display = columns
    elif isinstance(columns_to_show, str):
        columns_to_display = [columns_to_show]
    else:
        columns_to_display = columns_to_show

    for column in columns_to_display:
        if column in result:
            print(f"Column: {column}")
            result[column].show()
        else:
            raise ValueError(f"Column `{column}` not found in the DataFrame.")

    return result

In [42]:
# Call the function and show all columns
result = distinct_values_with_first_appearance(df, "Datum", "all")

# Accessing individual results
result["CASE_KEY"].show()
result["Funktion"].show()
result["Tarifname"].show()

# Specify a list of columns to show
columns_to_display = ["CASE_KEY", "Funktion"]

# Call the function and show the specified columns
result = distinct_values_with_first_appearance(df, "Datum", columns_to_display)

# Accessing individual results
result["CASE_KEY"].show()
result["Funktion"].show()


# Specify a single column to show
column_to_display = "Funktion"

# Call the function and show the specified column
result = distinct_values_with_first_appearance(df, "Datum", column_to_display)

# Accessing the result
result["Funktion"].show()


try:
    # Call the function with an invalid column name
    distinct_values_with_first_appearance(df, "Datum", "InvalidColumn")
except ValueError as e:
    print(e)

try:
    # Call the function with an invalid type for columns_to_show
    distinct_values_with_first_appearance(df, "Datum", 123)
except ValueError as e:
    print(e)

# Call the function and return the results
result = distinct_values_with_first_appearance(
    df, "Datum", ["CASE_KEY", "Funktion"])

# Further processing on the returned DataFrames
# For example, count the number of distinct values for each column
case_key_count = result["CASE_KEY"].count()
funktion_count = result["Funktion"].count()

print(f"Number of distinct CASE_KEY values: {case_key_count}")
print(f"Number of distinct Funktion values: {funktion_count}")

# Show all columns in the result at once using a loop
for column, col_df in result.items():
    print(f"Column: {column}")
    col_df.show()

Column: CASE_KEY


                                                                                

+--------+-------------------+
|CASE_KEY|CASE_KEY_first_date|
+--------+-------------------+
|      A1|   2022-01-23 08:00|
|      A2|   2022-01-23 10:20|
+--------+-------------------+

Column: Funktion
+------------+-------------------+
|    Funktion|Funktion_first_date|
+------------+-------------------+
|Antrag Start|   2022-01-23 08:00|
| Fristablauf|   2022-01-23 08:10|
|   Vorschlag|   2022-01-23 08:15|
|        Sync|   2022-01-23 10:20|
+------------+-------------------+

Column: Tarifname
+----------+--------------------+
| Tarifname|Tarifname_first_date|
+----------+--------------------+
|     Start|    2022-01-23 08:00|
|        BU|    2022-01-23 08:10|
|      null|                null|
|Sync Tarif|    2022-01-23 10:20|
+----------+--------------------+

+--------+-------------------+
|CASE_KEY|CASE_KEY_first_date|
+--------+-------------------+
|      A1|   2022-01-23 08:00|
|      A2|   2022-01-23 10:20|
+--------+-------------------+

+------------+-------------------+
| 

In [43]:
def distinct_values(
    df: DataFrame, columns_to_show: Union[str, List[str]]
) -> Dict[str, DataFrame]:
    """
    Compute distinct values for each column in the DataFrame.

    Args:
        df (DataFrame): The input DataFrame.
        columns_to_show (Union[str, List[str]]): Columns to display. It can be 'all', a list of column names, or a single column name.

    Returns:
        Dict[str, DataFrame]: A dictionary with column names as keys and DataFrames as values containing distinct values.

    Raises:
        ValueError: If `columns_to_show` is neither 'all', a column name, nor a list of column names.

    Examples:
        >>> df = spark.createDataFrame([
        ...     ("A1", "2022-01-23 08:00", "Antrag Start", "Start"),
        ...     ("A1", "2022-01-23 08:10", "Fristablauf ext.", "BU"),
        ...     ("A1", "2022-01-23 08:15", "Vorschlag", None),
        ...     ("A2", "2022-01-23 10:00", "Antrag Start", None),
        ...     ("A2", "2022-01-23 10:20", "Sync", "Sync Tarif")
        ... ], ["CASE_KEY", "Datum", "Funktion", "Tarifname"])
        >>> result = distinct_values(df, "all")
        >>> result["CASE_KEY"].show()
        >>> result["Funktion"].show()
        >>> result["Tarifname"].show()
    """

    if not isinstance(columns_to_show, (str, list)):
        raise ValueError(
            "`columns_to_show` must be 'all', a column name, or a list of column names."
        )

    result = {}
    columns = df.columns

    for column in columns:
        distinct_df = df.select(column).distinct()
        result[column] = distinct_df

    if columns_to_show == "all":
        columns_to_display = columns
    elif isinstance(columns_to_show, str):
        columns_to_display = [columns_to_show]
    else:
        columns_to_display = columns_to_show

    for column in columns_to_display:
        if column in result:
            print(f"Column: {column}")
            result[column].show()
        else:
            raise ValueError(f"Column `{column}` not found in the DataFrame.")

    return result

In [45]:
# Call the function and show all columns
result = distinct_values(df, "all")

# Accessing individual results
result["CASE_KEY"].show()
result["Funktion"].show()
result["Tarifname"].show()

+--------+
|CASE_KEY|
+--------+
|      A1|
|      A2|
+--------+

+------------+
|    Funktion|
+------------+
|Antrag Start|
| Fristablauf|
|   Vorschlag|
|        Sync|
+------------+

+----------+
| Tarifname|
+----------+
|     Start|
|        BU|
|      null|
|Sync Tarif|
+----------+



# Working with Null


In [46]:
def null_value_distribution_over_time(
    df: DataFrame,
    date_col: str,
    columns: Union[str, List[str]] = "all",
    timeframe: str = "day",
) -> DataFrame:
    """
    Compute the distribution of null values over time for specified columns with different timeframes.

    Args:
        df (DataFrame): The input DataFrame.
        date_col (str): The name of the date column.
        columns (Union[str, List[str]]): The column(s) to check for null values. It can be 'all', a list of column names, or a single column name.
        timeframe (str): The timeframe for aggregation ('year', 'month', 'day').

    Returns:
        DataFrame: A DataFrame showing the distribution of null values over time.

    Raises:
        ValueError: If `timeframe` is not one of 'year', 'month', or 'day'.
    """

    # Ensure columns is a list of relevant columns
    if columns == "all":
        columns = [col for col in df.columns if col != date_col]
    elif isinstance(columns, str):
        columns = [columns]

    # Define the date format based on the timeframe
    if timeframe == "year":
        df = df.withColumn("timeframe", F.year(F.col(date_col)).cast("string"))
    elif timeframe == "month":
        df = df.withColumn("timeframe", F.date_format(F.col(date_col), "yyyy-MM"))
    elif timeframe == "day":
        df = df.withColumn("timeframe", F.date_format(F.col(date_col), "yyyy-MM-dd"))
    else:
        raise ValueError(
            "Invalid value for timeframe. It should be 'year', 'month', or 'day'."
        )

    # Create a DataFrame with counts of null and non-null values for each column over time
    null_distributions = []
    for column in columns:
        null_count = df.withColumn(
            f"{column}_is_null", F.col(column).isNull().cast("int")
        )

        # Group by timeframe and aggregate counts of null and non-null values
        null_distribution = null_count.groupBy("timeframe").agg(
            F.sum(f"{column}_is_null").alias(f"{column}_null_count"),
            F.count(f"{column}_is_null").alias(f"{column}_total_count"),
            (F.count(f"{column}_is_null") - F.sum(f"{column}_is_null")).alias(
                f"{column}_non_null_count"
            ),
        )

        null_distributions.append(null_distribution)

    # Join all the null distributions on the timeframe column
    result_df = null_distributions[0]
    for dist_df in null_distributions[1:]:
        result_df = result_df.join(dist_df, on="timeframe", how="outer")

    return result_df

In [47]:
# Get null value distribution for the "Tarifname" column by day
result_df = null_value_distribution_over_time(
    df, "Datum", "Tarifname", timeframe="month"
)
result_df.show()

# Get null value distribution for the "Funktion" and "Tarifname" columns by month
result_df = null_value_distribution_over_time(
    df, "Datum", ["Funktion", "Tarifname"], timeframe="day"
)
result_df.show()

# Get null value distribution for the "CASE_KEY" and "Tarifname" columns by year
result_df = null_value_distribution_over_time(df, "Datum", "all", timeframe="year")
result_df.show()

+---------+--------------------+---------------------+------------------------+
|timeframe|Tarifname_null_count|Tarifname_total_count|Tarifname_non_null_count|
+---------+--------------------+---------------------+------------------------+
|  2022-01|                   2|                    5|                       3|
+---------+--------------------+---------------------+------------------------+

+----------+-------------------+--------------------+-----------------------+--------------------+---------------------+------------------------+
| timeframe|Funktion_null_count|Funktion_total_count|Funktion_non_null_count|Tarifname_null_count|Tarifname_total_count|Tarifname_non_null_count|
+----------+-------------------+--------------------+-----------------------+--------------------+---------------------+------------------------+
|2022-01-23|                  0|                   4|                      4|                   1|                    4|                       3|
|2022-01-24|   

# Rename multiple columns


In [48]:
def rename_columns(df: DataFrame, column_mapping: Dict[str, str]) -> DataFrame:
    """
    Rename columns of a DataFrame according to a provided mapping dictionary.

    Args:
        df (DataFrame): The input DataFrame whose columns need to be renamed.
        column_mapping (Dict[str, str]): A dictionary where the keys are the current column names
                                         and the values are the new column names.

    Returns:
        DataFrame: The DataFrame with renamed columns.

    Example:
        >>> spark = SparkSession.builder.appName("Example").getOrCreate()
        >>> data = [("A1", "2022-01-23 08:00:00", "Antrag Start", "Start"),
        ...         ("A2", "2022-01-23 10:00:00", "Antrag Start", "Start")]
        >>> columns = ["CASE_KEY", "Datum", "Funktion", "Tarifname"]
        >>> df = spark.createDataFrame(data, columns)
        >>> column_mapping = {"CASE_KEY": "case_id", "Datum": "timestamp"}
        >>> df_renamed = rename_columns(df, column_mapping)
        >>> df_renamed.show()
        +-------+-------------------+-------------+---------+
        |case_id|          timestamp|     Funktion|Tarifname|
        +-------+-------------------+-------------+---------+
        |     A1|2022-01-23 08:00:00|Antrag Start|    Start |
        |     A2|2022-01-23 10:00:00|Antrag Start|    Start |
        +-------+-------------------+-------------+---------+
    """
    # Validate that all keys in column_mapping exist in the DataFrame's columns
    for old_name in column_mapping.keys():
        if old_name not in df.columns:
            raise ValueError(f"Column '{old_name}' does not exist in the DataFrame.")

    # Rename the columns according to the mapping
    for old_name, new_name in column_mapping.items():
        df = df.withColumnRenamed(old_name, new_name)

    return df

In [49]:
df = spark.createDataFrame(
    [
        ("A1", "2022-01-23 08:00", "Antrag Start", "Start"),
        ("A2", "2022-01-23 10:00", "Antrag Start", "Start"),
    ],
    ["CASE_KEY", "Datum", "Funktion", "Tarifname"],
)


# Define the column mapping
column_mapping = {
    "CASE_KEY": "case_id",
    "Datum": "timestamp",
    "Funktion": "function",
    "Tarifname": "tariff_name",
}

# Rename the columns of the DataFrame
df_renamed = rename_columns(df, column_mapping)

# Show the DataFrame with renamed columns
df_renamed.show()

+-------+----------------+------------+-----------+
|case_id|       timestamp|    function|tariff_name|
+-------+----------------+------------+-----------+
|     A1|2022-01-23 08:00|Antrag Start|      Start|
|     A2|2022-01-23 10:00|Antrag Start|      Start|
+-------+----------------+------------+-----------+



# Rename Column values


In [9]:
def rename_column_values(
    df: DataFrame,
    column_name: str,
    value_mapping: Dict[str, str],
    conditions: Optional[Dict[str, Column]] = None,
) -> DataFrame:
    """
    Rename values in a specified column of a DataFrame based on a provided mapping dictionary,
    and optionally apply additional conditions to set specific values.

    Args:
        df (DataFrame): The input DataFrame.
        column_name (str): The name of the column whose values need to be renamed.
        value_mapping (Dict[str, str]): A dictionary where the keys are the current values
                                        and the values are the new values.
        conditions (Optional[Dict[str, Column]]): An optional dictionary where the keys are the values to be set
                                                  and the values are the conditions to be met.

    Returns:
        DataFrame: The DataFrame with renamed values in the specified column.

    Example:
        >>> from pyspark.sql.functions import col
        >>> spark = SparkSession.builder.appName("Example").getOrCreate()
        >>> data = [("A1", "Start", "Tarif1"), ("A2", "Process", None), ("A3", "End", "Tarif2")]
        >>> columns = ["CASE_KEY", "Funktion", "Tarif"]
        >>> df = spark.createDataFrame(data, columns)
        >>> value_mapping = {"Start": "Begin", "Process": "In Progress", "End": "Complete"}
        >>> conditions = {"SpecialValue": (col("Funktion").like("%Antrag%") & (col("CASE_KEY") == "A1")),
        ...               "TarifValue": col("Tarif").isNotNull()}
        >>> df_renamed = rename_column_values(df, "Funktion", value_mapping, conditions)
        >>> df_renamed.show()
        +--------+-----------+-------+
        |CASE_KEY|   Funktion|  Tarif|
        +--------+-----------+-------+
        |     A1|SpecialValue| Tarif1|
        |     A2| In Progress|   null|
        |     A3|   Complete| Tarif2|
        +--------+-----------+-------+
    """
    # Start with the initial column
    updated_column = F.col(column_name)

    # Apply conditions if provided
    if conditions:
        for new_value, condition in conditions.items():
            updated_column = F.when(
                condition, new_value).otherwise(updated_column)

    # Apply the value mapping
    for old_value, new_value in value_mapping.items():
        updated_column = F.when(F.col(column_name) == old_value, new_value).otherwise(
            updated_column
        )

    # Update the DataFrame with the renamed values
    df = df.withColumn(column_name, updated_column)

    return df

In [55]:
# Sample data
data = [("A1", "Start", "Tarif1"), ("A2", "Process", None), ("A3", "End", "Tarif2")]
columns = ["CASE_KEY", "Funktion", "Tarif"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Value mapping dictionary
value_mapping = {"Start": "Begin", "Process": "In Progress", "End": "Complete"}

# Conditions dictionary
conditions = {
    "SpecialValue": (F.col("Funktion").like("%Antrag%")),
    "TarifValue": F.col("Tarif").isNotNull(),
}

# Rename column values with conditions
df_renamed_with_conditions = rename_column_values(
    df, "Funktion", value_mapping, conditions
)
df_renamed_with_conditions.show()

# Rename column values without conditions
df_renamed_without_conditions = rename_column_values(df, "Funktion", value_mapping)
df_renamed_without_conditions.show()

+--------+-----------+------+
|CASE_KEY|   Funktion| Tarif|
+--------+-----------+------+
|      A1|      Begin|Tarif1|
|      A2|In Progress|  null|
|      A3|   Complete|Tarif2|
+--------+-----------+------+

+--------+-----------+------+
|CASE_KEY|   Funktion| Tarif|
+--------+-----------+------+
|      A1|      Begin|Tarif1|
|      A2|In Progress|  null|
|      A3|   Complete|Tarif2|
+--------+-----------+------+



# Joining Multiple Dataframes


In [8]:
def join_dataframes(
    df1: DataFrame,
    df2: DataFrame,
    join_keys: Tuple[Union[str, List[str]], Union[str, List[str]]],
    join_type: str,
) -> DataFrame:
    """
    Join two DataFrames on the specified join keys with the chosen join type.

    Args:
        df1 (DataFrame): The first DataFrame.
        df2 (DataFrame): The second DataFrame.
        join_keys (Tuple[Union[str, List[str]], Union[str, List[str]]]): A tuple containing the join keys for df1 and df2.
        join_type (str): The type of join to perform. Options are 'inner', 'outer', 'left', 'right', 'semi', 'anti', etc.

    Returns:
        DataFrame: The joined DataFrame.

    Raises:
        ValueError: If the keys in the second DataFrame are not unique.

    Example:
        >>> from pyspark.sql import SparkSession
        >>> data1 = [("A1", 10), ("A2", 20)]
        >>> columns1 = ["CASE_KEY_1", "Value1"]
        >>> df1 = spark.createDataFrame(data1, columns1)
        >>> data2 = [("A1", "X"), ("A2", "Y")]
        >>> columns2 = ["CASE_KEY_2", "Value2"]
        >>> df2 = spark.createDataFrame(data2, columns2)
        >>> result = join_dataframes(df1, df2, ("CASE_KEY_1", "CASE_KEY_2"), "inner")
        >>> result.show()
        +-----------+------+-----------+
        |CASE_KEY_1 |Value1| CASE_KEY_2|Value2|
        +-----------+------+-----------+
        |      A1|    10|      A1|     X|
        |      A2|    20|      A2|     Y|
        +-----------+------+-----------+
    """
    join_key_df1, join_key_df2 = join_keys

    # Convert join keys to lists if they are strings
    if isinstance(join_key_df1, str):
        join_key_df1 = [join_key_df1]
    if isinstance(join_key_df2, str):
        join_key_df2 = [join_key_df2]

    # Check for uniqueness of join_key in df2
    unique_check = df2.groupBy(join_key_df2).count().filter(F.col("count") > 1)
    if unique_check.count() > 0:
        raise ValueError("The join keys in the second DataFrame are not unique.")

    # Resolve potential column name conflicts by renaming columns in df2
    df1_cols = set(df1.columns)
    df2_cols = set(df2.columns)
    common_cols = df1_cols.intersection(df2_cols).difference(
        set(join_key_df1).union(set(join_key_df2))
    )

    for F.col in common_cols:
        df2 = df2.withColumnRenamed(F.col, f"{F.col}_df2")

    # Create join condition
    join_condition = [df1[k1] == df2[k2] for k1, k2 in zip(join_key_df1, join_key_df2)]

    # Perform the join
    joined_df = df1.join(df2, on=join_condition, how=join_type)

    return joined_df

In [57]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Example").getOrCreate()

# Sample data for the first DataFrame
data1 = [("A1", 10), ("A2", 20)]
columns1 = ["CASE_KEY_1", "Value1"]

# Sample data for the second DataFrame
data2 = [("A1", "X"), ("A2", "Y")]
columns2 = ["CASE_KEY_2", "Value2"]

# Create DataFrames
df1 = spark.createDataFrame(data1, columns1)
df2 = spark.createDataFrame(data2, columns2)

# Join DataFrames
result = join_dataframes(df1, df2, ("CASE_KEY_1", "CASE_KEY_2"), "inner")

# Show the result
result.show()

+----------+------+----------+------+
|CASE_KEY_1|Value1|CASE_KEY_2|Value2|
+----------+------+----------+------+
|        A1|    10|        A1|     X|
|        A2|    20|        A2|     Y|
+----------+------+----------+------+



# Duplicate Checker


In [7]:
def check_unique_keys(
    df: DataFrame, keys: Union[str, List[str]], show_duplicates: bool = False
) -> str:
    """
    Check whether the combination of keys is unique in the DataFrame.

    Args:
        df (DataFrame): The input DataFrame.
        keys (Union[str, List[str]]): The key(s) to check for uniqueness. This can be a string for a single key or a list of column names.
        show_duplicates (bool): If True, shows the DataFrame with duplicate keys. Default is False.

    Returns:
        str: "Test passed!" if the combination of keys is unique, or "Test failed: Number of duplicates " followed by the number of duplicates.

    Raises:
        ValueError: If 'keys' is not a string or a list of strings, or if 'keys' are not columns in the DataFrame.

    Examples:
        >>> spark = SparkSession.builder.appName("Example").getOrCreate()
        >>> data = [("A1", "2022-01-23 08:00", "Antrag Start", "AT"),
        ...         ("A1", "2022-01-23 08:10", "Fristablauf ext.", "Signatul BU"),
        ...         ("A2", "2022-01-23 10:00", "Antrag Start", "")]
        >>> columns = ["CASE_KEY", "Datum", "Funktion", "Tarifname"]
        >>> df = spark.createDataFrame(data, schema=columns)
        >>> result = check_unique_keys(df, ["CASE_KEY", "Datum"])
        >>> print(result)
        Test passed!
    """

    # Validate input keys
    if isinstance(keys, str):
        keys = [keys]
    elif not isinstance(keys, list) or not all(isinstance(key, str) for key in keys):
        raise ValueError("'keys' must be a string or a list of strings")

    # Check if all keys are columns in the DataFrame
    missing_keys = [key for key in keys if key not in df.columns]
    if missing_keys:
        raise ValueError(
            f"The following keys are not columns in the DataFrame: {', '.join(missing_keys)}"
        )

    # Group by the keys and count the occurrences
    grouped_df = df.groupBy(keys).count()

    # Filter the groups with count > 1 to find duplicates
    duplicates_df = grouped_df.filter(col("count") > 1)
    num_duplicates = duplicates_df.count()

    if num_duplicates == 0:
        return "Test passed!"
    else:
        if show_duplicates:
            duplicates = df.join(duplicates_df.select(keys), on=keys, how="inner")
            duplicates.show()
        return f"Test failed: Number of duplicates {num_duplicates}"

# Filter Dataframe


In [5]:
def filter_dataframe(df: DataFrame, conditions: Dict[str, Column]) -> DataFrame:
    """
    Apply multiple conditions to filter a DataFrame.

    Args:
        df (DataFrame): The DataFrame to filter.
        conditions (Dict[str, Column]): A dictionary where the keys are condition names and the values are PySpark Column conditions.

    Returns:
        DataFrame: The filtered DataFrame.

    Raises:
        ValueError: If any of the conditions is not a PySpark Column.

    Examples:
        >>> from pyspark.sql.functions import col
        >>> data = [("A1", "2022-01-23 08:00", "Antrag Start", "AT"),
        ...         ("A1", "2022-01-23 08:10", "Fristablauf ext.", "Signatul BU"),
        ...         ("A2", "2022-01-23 10:00", "Antrag Start", "")]
        >>> columns = ["CASE_KEY", "Datum", "Funktion", "Tarifname"]
        >>> df = spark.createDataFrame(data, schema=columns)
        >>> conditions = {
        ...     "Tarifname not empty": col("Tarifname") != "",
        ...     "CASE_KEY A1": col("CASE_KEY") == "A1"
        ... }
        >>> filtered_df = filter_dataframe(df, conditions)
        >>> filtered_df.show()
        +--------+-------------------+--------------+-----------+
        |CASE_KEY|              Datum|      Funktion|  Tarifname|
        +--------+-------------------+--------------+-----------+
        |      A1|2022-01-23 08:00:00|  Antrag Start|         AT|
        |      A1|2022-01-23 08:10:00|Fristablauf ext.|Signatul BU|
        +--------+-------------------+--------------+-----------+
    """
    # Validate that all conditions are PySpark Column objects
    if not all(isinstance(condition, Column) for condition in conditions.values()):
        raise ValueError("All conditions must be PySpark Column objects")

    # Apply each condition to filter the DataFrame
    for condition in conditions.values():
        df = df.filter(condition)
    return df