In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, trim, ltrim, rtrim, regexp_replace
from typing import Union, List, Literal

# Create (or reuse) the local Spark session used by every example in this notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName("TestWhitespaceTrim")
    .getOrCreate()
)

# Function for trimming whitespace
def trim_whitespace(
    df: DataFrame,
    columns: Union[str, List[str]],
    trim_type: Literal["both", "left", "right"] = "both",
) -> DataFrame:
    """
    Collapse whitespace runs to single spaces and trim the selected columns.

    Every run of whitespace (regex ``\\s+``: spaces, tabs, newlines, ...) in a
    selected column is first replaced by one space, and the result is then
    trimmed on the requested side(s). Columns of ``df`` that are not listed in
    ``columns`` are passed through unchanged, and the original column order is
    kept. Names in ``columns`` that do not exist in ``df`` are ignored.

    Args:
        df (DataFrame): Input DataFrame.
        columns (Union[str, List[str]]): Column name or list of column names to clean.
        trim_type (Literal['both', 'left', 'right']): Side(s) to trim. With
            'left' or 'right' the untrimmed side keeps at most one space,
            because whitespace runs are collapsed everywhere.

    Returns:
        DataFrame: DataFrame with the same columns, where the selected ones are
        whitespace-normalised.

    Raises:
        ValueError: If ``trim_type`` is not 'both', 'left' or 'right'.
    """
    if trim_type not in ["both", "left", "right"]:
        raise ValueError("trim_type must be 'both', 'left', or 'right'")

    if isinstance(columns, str):
        columns = [columns]

    trim_func = {"both": trim, "left": ltrim, "right": rtrim}[trim_type]

    def trim_and_reduce_spaces(c):
        # Collapse BEFORE trimming: Spark's trim/ltrim/rtrim strip only the
        # space character, so a leading/trailing tab or newline has to be
        # turned into a space first or it would survive as a stray edge space.
        collapsed = regexp_replace(col(c), r"\s+", " ")
        return trim_func(collapsed)

    trim_exprs = [
        trim_and_reduce_spaces(c).alias(c) if c in columns else col(c)
        for c in df.columns
    ]

    return df.select(*trim_exprs)

# Two rows of names padded with leading, trailing and repeated inner spaces
test_data = [
    ("  John   Doe  ", "  Jane    Smith  "),
    (" Alice    Wonderland  ", "  Bob   Builder   "),
]
test_df = spark.createDataFrame(test_data, schema=["full_name1", "full_name2"])

# Clean both name columns using the default trim_type ("both")
trimmed_df = trim_whitespace(test_df, columns=["full_name1", "full_name2"])

# Print the cleaned rows
trimmed_df.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/01 18:06:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]

+----------------+-----------+
|      full_name1| full_name2|
+----------------+-----------+
|        John Doe| Jane Smith|
|Alice Wonderland|Bob Builder|
+----------------+-----------+



                                                                                

In [4]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import coalesce, col
from functools import reduce


def first_non_null_value(dataframes, id_column, columns):
    """
    Given a list of PySpark DataFrames, returns a DataFrame with the first non-null value for specified columns.

    The DataFrames are full-outer-joined on ``id_column``; for every requested
    column the value is taken from the earliest DataFrame in ``dataframes``
    whose value is not null.

    NOTE(review): the join assumes ``id_column`` is unique within each input
    DataFrame; duplicated ids would presumably be multiplied by the join.

    Parameters:
    dataframes (list of DataFrame): Non-empty list of PySpark DataFrames to consider, in priority order.
    id_column (str): The name of the column to join on (e.g., 'id').
    columns (list of str): List of column names for which to find the first non-null value.

    Returns:
    DataFrame: A PySpark DataFrame with the id_column and the first non-null values for specified columns.

    Raises:
    ValueError: If an argument has the wrong type or ``dataframes`` is empty.
    """
    # Check input types
    if not isinstance(dataframes, list) or not all(
        isinstance(df, DataFrame) for df in dataframes
    ):
        raise ValueError("dataframes should be a list of PySpark DataFrames")
    if not dataframes:
        # reduce() below has no initial value, so an empty list would otherwise
        # surface as an opaque TypeError instead of a validation error.
        raise ValueError("dataframes should contain at least one DataFrame")
    if not isinstance(id_column, str):
        raise ValueError("id_column should be a string")
    if not isinstance(columns, list) or not all(isinstance(c, str) for c in columns):
        raise ValueError("columns should be a list of strings")

    def suffixed(name, idx):
        # Per-DataFrame alias (1-based) that keeps value columns apart in the join
        return f"{name}_df{idx + 1}"

    # Keep only the id plus the requested columns, renamed to avoid conflicts during join
    renamed_dataframes = [
        df.select(col(id_column), *[col(c).alias(suffixed(c, idx)) for c in columns])
        for idx, df in enumerate(dataframes)
    ]

    # Perform full outer join on the id_column so ids present in any input are kept
    joined_df = reduce(
        lambda left, right: left.join(right, on=id_column, how="full_outer"),
        renamed_dataframes,
    )

    # Coalesce in list order: the first DataFrame with a non-null value wins
    coalesced_columns = [
        coalesce(*[col(suffixed(c, idx)) for idx in range(len(dataframes))]).alias(c)
        for c in columns
    ]

    return joined_df.select(col(id_column), *coalesced_columns)

In [7]:
# Three sample DataFrames sharing the schema (id, value, VSNR, Text)
data1 = [
    (1, 10, "A10", "Apple"),
    (2, 20, "B20", "Banana"),
    (3, 30, "A10", "Apple"),
]
df1 = spark.createDataFrame(data1, schema=["id", "value", "VSNR", "Text"])

data2 = [
    (1, None, "A10", "Apple"),
    (2, 20, None, "Banana"),
    (3, 30, "A10", None),
]
df2 = spark.createDataFrame(data2, schema=["id", "value", "VSNR", "Text"])

data3 = [
    (1, 10, None, "Apple"),
    (2, None, "B20", "Banana"),
    (3, 30, "A10", "Cheese"),
    (4, 20, None, "Cheese"),
]
df3 = spark.createDataFrame(data3, schema=["id", "value", "VSNR", "Text"])

# Take, per id, the first non-null value of each column across df1, df2, df3
result_df = first_non_null_value(
    [df1, df2, df3],
    "id",
    ["value", "VSNR", "Text"],
)

# Print the combined rows
result_df.show()

+---+-----+----+------+
| id|value|VSNR|  Text|
+---+-----+----+------+
|  1|   10| A10| Apple|
|  2|   20| B20|Banana|
|  3|   30| A10| Apple|
|  4|   20|null|Cheese|
+---+-----+----+------+



In [11]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import coalesce, col, first
from functools import reduce


def enrich_with_first_non_null(basis_df, dataframes, id_column, columns):
    """
    Fill the requested columns of the basis DataFrame from other DataFrames.

    Each DataFrame in ``dataframes`` is reduced to one row per id (first
    non-null value per column), left-joined onto ``basis_df``, and every
    requested column is then coalesced: the basis value comes first, followed
    by the values of the other DataFrames in list order.

    Parameters:
    basis_df (DataFrame): The basis DataFrame with unique id_column.
    dataframes (list of DataFrame): DataFrames used for enrichment; id_column may not be unique.
    id_column (str): The name of the column to join on (e.g., 'id').
    columns (list of str): Column names for which to find the first non-null value.

    Returns:
    DataFrame: DataFrame holding id_column plus the coalesced ``columns``.

    Raises:
    ValueError: If an argument does not have the expected type.
    """
    # Validate arguments (same order and messages as the checks are reported in)
    if not isinstance(basis_df, DataFrame):
        raise ValueError("basis_df should be a PySpark DataFrame")
    frames_ok = isinstance(dataframes, list) and all(
        isinstance(df, DataFrame) for df in dataframes
    )
    if not frames_ok:
        raise ValueError("dataframes should be a list of PySpark DataFrames")
    if not isinstance(id_column, str):
        raise ValueError("id_column should be a string")
    names_ok = isinstance(columns, list) and all(isinstance(c, str) for c in columns)
    if not names_ok:
        raise ValueError("columns should be a list of strings")

    # One row per id and DataFrame: first non-null value of every requested column,
    # aliased with a 1-based "_df<N>" suffix
    aggregated_dfs = [
        df.groupBy(id_column).agg(
            *[first(c, ignorenulls=True).alias(c + f"_df{idx+1}") for c in columns]
        )
        for idx, df in enumerate(dataframes)
    ]

    # Left-join every aggregate onto the basis so only basis ids are kept
    joined = reduce(
        lambda acc, agg_df: acc.join(agg_df, on=id_column, how="left"),
        aggregated_dfs,
        basis_df,
    )

    # Basis value first, then the aggregates in list order
    n_sources = len(aggregated_dfs)
    merged_columns = [
        coalesce(col(c), *[col(f"{c}_df{idx+1}") for idx in range(n_sources)]).alias(c)
        for c in columns
    ]

    return joined.select([col(id_column), *merged_columns])

In [13]:
# Basis rows: one row per id, with gaps (None) to be filled
data_basis = [
    (1, None, "A10", "Apple"),
    (2, 20, None, None),
    (3, 30, "A10", None),
]
df_basis = spark.createDataFrame(data_basis, schema=["id", "value", "VSNR", "Text"])

# Enrichment sources; id 1 appears twice in the first one
data2 = [
    (1, None, "A10", None),
    (1, 20, None, "Banana"),
    (3, 30, "A10", None),
]
df2 = spark.createDataFrame(data2, schema=["id", "value", "VSNR", "Text"])

data3 = [
    (1, 10, None, "Apple"),
    (2, None, "B20", "Banana"),
    (3, 30, "A10", "Apple"),
]
df3 = spark.createDataFrame(data3, schema=["id", "value", "VSNR", "Text"])

# Fill the gaps of the basis rows from df2 first, then df3
columns = ["value", "VSNR", "Text"]
enriched_df = enrich_with_first_non_null(
    df_basis,
    [df2, df3],
    "id",
    columns,
)

# Print the enriched rows
enriched_df.show()

+---+-----+----+------+
| id|value|VSNR|  Text|
+---+-----+----+------+
|  1|   20| A10| Apple|
|  3|   30| A10| Apple|
|  2|   20| B20|Banana|
+---+-----+----+------+



24/10/26 06:00:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 4462956 ms exceeds timeout 120000 ms
24/10/26 06:00:02 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/26 06:00:05 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.

# Changing Multiple DataFrames at once

In [None]:
from pyspark.sql import DataFrame


# Example function to transform a DataFrame
def transform_dataframe(df: DataFrame) -> DataFrame:
    """Return ``df`` with null-containing rows dropped (placeholder transformation)."""
    # Swap this step for any other transformation (exploding columns, etc.)
    without_nulls = df.dropna()
    return without_nulls


# DataFrames to process; df1, df2 and df3 must already exist in the session
dataframes_list = [df1, df2, df3]

# Results, keyed "df_1", "df_2", ... following the order of dataframes_list
transformed_dfs = dict()

for i, df in enumerate(dataframes_list):
    key = f"df_{i+1}"
    # Transform one DataFrame and file it under its positional key
    transformed_df = transform_dataframe(df)
    transformed_dfs[key] = transformed_df

# transformed_dfs now maps each positional key to its transformed DataFrame

In [None]:
# dictionary comprehension
from pyspark.sql import DataFrame


# Example function to transform a DataFrame
def transform_dataframe(df: DataFrame) -> DataFrame:
    """Example transformation: drop the rows of ``df`` that contain nulls."""
    # Stand-in for real logic (dropping nulls, exploding columns, etc.)
    cleaned = df.dropna()
    return cleaned


# Named input DataFrames: name -> DataFrame
dataframes_dict = {
    "df1": df1,
    "df2": df2,
    "df3": df3,
}

# Apply the transformation to every entry, keeping the original names as keys
transformed_dfs = dict(
    (name, transform_dataframe(df)) for name, df in dataframes_dict.items()
)

# transformed_dfs now holds one transformed DataFrame per original name

In [None]:
from functools import reduce
from pyspark.sql import DataFrame


# Example transformation functions
def drop_nulls(df: DataFrame) -> DataFrame:
    """Return ``df`` with null-containing rows dropped."""
    without_nulls = df.dropna()
    return without_nulls


def add_new_column(df: DataFrame) -> DataFrame:
    """Return ``df`` with a constant string column ``new_col`` = "example_value"."""
    # `F` is not imported anywhere in this notebook, so the original call
    # raised NameError; import it here so the function works on a fresh kernel.
    from pyspark.sql import functions as F

    return df.withColumn("new_col", F.lit("example_value"))


def transform_dataframe(df: DataFrame) -> DataFrame:
    """Run ``df`` through ``drop_nulls`` and then ``add_new_column``."""
    result = df
    # Each step receives the output of the previous one
    for step in (drop_nulls, add_new_column):
        result = step(result)
    return result


# Named input DataFrames: name -> DataFrame
dataframes_dict = {
    "df1": df1,
    "df2": df2,
    "df3": df3,
}

# Run the transformation pipeline on every entry, keeping the names as keys
transformed_dfs = dict(
    (name, transform_dataframe(df)) for name, df in dataframes_dict.items()
)

In [None]:
# `F` was never imported in this notebook; bring it into scope so this cell
# does not fail with NameError on a fresh kernel.
from pyspark.sql import functions as F

# Relies on `dataframes_dict` and `transformed_dfs` created by an earlier cell;
# existing entries in `transformed_dfs` are overwritten name by name.
for name, df in dataframes_dict.items():
    print(f"Processing DataFrame: {name}")
    # Apply different transformations based on the DataFrame name
    if name == "df1":
        # NOTE(review): complex_transformations is not defined anywhere in the
        # visible notebook - define or import it before running this cell.
        transformed_df = complex_transformations(df)
    elif name == "df2":
        # Maybe df2 needs a special transformation
        transformed_df = df.withColumn("special_col", F.lit("special_value"))
    else:
        # Default transformation for other DataFrames
        transformed_df = df.withColumn("default_col", F.lit("default_value"))

    # Store the transformed DataFrame
    transformed_dfs[name] = transformed_df

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, sum as spark_sum


def find_empty_columns(df: DataFrame) -> list:
    """
    Identifies empty columns in a DataFrame.

    A column is reported as empty when its number of null values equals the
    number of rows of ``df``.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame to check for empty columns.

    Returns
    -------
    List[str]
        A list of column names that are empty in the DataFrame. A DataFrame
        without rows yields an empty list.
    """
    # count() is a Spark action: evaluate it once instead of once per column.
    total_rows = df.count()
    if total_rows == 0:
        # With no rows the null sums are NULL and never compare equal to the
        # row count, so no column is reported; return that result directly
        # and skip the aggregation job.
        return []

    empty_columns = (
        df.select(
            [
                (spark_sum(col(c).isNull().cast("int")) == total_rows).alias(c)
                for c in df.columns
            ]
        )
        .first()
        .asDict()
    )
    return [col_name for col_name, is_empty in empty_columns.items() if is_empty]


# Example usage:
# empty_cols = find_empty_columns(your_dataframe)
# print(f"Empty columns: {empty_cols}")