In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from typing import List, Dict

# Initialize a Spark session
spark = SparkSession.builder.master("local").appName("TestSelectColumns").getOrCreate()

# Sample data for testing
data1 = [("Alice", 1, 23, "NY"), ("Bob", 2, 34, "CA")]
data2 = [("Carol", "HR", 5000), ("Dave", "Finance", 7000)]
data3 = [(1, "Electronics", "Phone"), (2, "Clothing", "Shirt")]

# Create DataFrames from the sample data
df1 = spark.createDataFrame(data1, ["name", "id", "age", "state"])
df2 = spark.createDataFrame(data2, ["name", "department", "salary"])
df3 = spark.createDataFrame(data3, ["product_id", "category", "product_name"])


# Example function to select columns from a DataFrame
def select_columns(df: DataFrame, columns: List[str]) -> DataFrame:
    """
    Select specified columns from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Input PySpark DataFrame.
    columns : List[str]
        List of column names to select.

    Returns
    -------
    DataFrame
        Transformed DataFrame containing only specified columns.
    """
    return df.select(*columns)


# List of DataFrames and columns to select for testing
dataframes_list = [df1, df2, df3]
columns_to_select = [
    ["name", "age"],  # Selecting columns for df1
    ["name", "salary"],  # Selecting columns for df2
    ["category", "product_name"],  # Selecting columns for df3
]

# Initialize an empty dictionary to store transformed DataFrames
transformed_dfs: Dict[str, DataFrame] = {}

# Apply the transformation and store in dictionary
for i, (df, cols) in enumerate(zip(dataframes_list, columns_to_select)):
    transformed_df = select_columns(df, cols)
    transformed_dfs[f"df_{i+1}"] = transformed_df

# Show results to verify the transformation
for key, transformed_df in transformed_dfs.items():
    print(f"Transformed DataFrame {key}:")
    transformed_df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/26 12:03:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Transformed DataFrame df_1:


                                                                                

+-----+---+
| name|age|
+-----+---+
|Alice| 23|
|  Bob| 34|
+-----+---+

Transformed DataFrame df_2:
+-----+------+
| name|salary|
+-----+------+
|Carol|  5000|
| Dave|  7000|
+-----+------+

Transformed DataFrame df_3:
+-----------+------------+
|   category|product_name|
+-----------+------------+
|Electronics|       Phone|
|   Clothing|       Shirt|
+-----------+------------+



In [2]:
from pyspark.sql import DataFrame
from typing import List, Dict


def select_columns_with_prefix(df: DataFrame, prefix: str) -> DataFrame:
    """
    Select columns from a DataFrame that start with a specific prefix.

    Parameters
    ----------
    df : DataFrame
        Input PySpark DataFrame.
    prefix : str
        Prefix to filter column names.

    Returns
    -------
    DataFrame
        Transformed DataFrame containing only columns with the specified prefix.
    """
    # Filter columns with the specified prefix
    columns_to_select = [col for col in df.columns if col.startswith(prefix)]
    return df.select(*columns_to_select)


# Example usage
def process_dataframes_with_prefix(
    dataframes: List[DataFrame], prefix: str
) -> Dict[str, DataFrame]:
    """
    Process a list of DataFrames, selecting columns with a specific prefix for each.

    Parameters
    ----------
    dataframes : List[DataFrame]
        List of DataFrames to process.
    prefix : str
        Prefix to filter column names.

    Returns
    -------
    Dict[str, DataFrame]
        Dictionary of transformed DataFrames with keys as "df_1", "df_2", etc.
    """
    transformed_dfs: Dict[str, DataFrame] = {}

    # Loop through each DataFrame and apply the prefix-based column selection
    for i, df in enumerate(dataframes):
        transformed_df = select_columns_with_prefix(df, prefix)
        transformed_dfs[f"df_{i+1}"] = transformed_df

    return transformed_dfs


# Example Test Code
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    # Initialize Spark session
    spark = (
        SparkSession.builder.master("local")
        .appName("PrefixColumnSelection")
        .getOrCreate()
    )

    # Create example DataFrames
    data1 = [("Alice", 1, 23, "NY"), ("Bob", 2, 34, "CA")]
    data2 = [("Carol", "HR", 5000), ("Dave", "Finance", 7000)]
    data3 = [(1, "Electronics", "Phone"), (2, "Clothing", "Shirt")]

    df1 = spark.createDataFrame(data1, ["name", "user_id", "age", "state"])
    df2 = spark.createDataFrame(data2, ["name", "department", "dept_salary"])
    df3 = spark.createDataFrame(data3, ["product_id", "category", "product_name"])

    # Test the function
    dataframes_list = [df1, df2, df3]
    prefix = "dept"  # Selecting columns with prefix "dept"

    # Process the DataFrames
    result_dfs = process_dataframes_with_prefix(dataframes_list, prefix)

    # Display results
    for key, transformed_df in result_dfs.items():
        print(f"Transformed DataFrame {key}:")
        transformed_df.show()

24/10/26 12:06:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Transformed DataFrame df_1:
++
||
++
||
||
++

Transformed DataFrame df_2:
+-----------+
|dept_salary|
+-----------+
|       5000|
|       7000|
+-----------+

Transformed DataFrame df_3:
++
||
++
||
||
++



In [3]:
from pyspark.sql import DataFrame
from typing import List, Dict
from pyspark.sql.types import NumericType


def select_numeric_columns(df: DataFrame) -> DataFrame:
    """
    Select only numeric columns from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Input PySpark DataFrame.

    Returns
    -------
    DataFrame
        Transformed DataFrame containing only numeric columns.
    """
    # Filter columns based on their data type being a subclass of NumericType
    numeric_columns = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, NumericType)
    ]
    return df.select(*numeric_columns)


# Function to process a list of DataFrames, selecting only numeric columns
def process_dataframes_numeric_only(
    dataframes: List[DataFrame],
) -> Dict[str, DataFrame]:
    """
    Process a list of DataFrames, selecting only numeric columns for each.

    Parameters
    ----------
    dataframes : List[DataFrame]
        List of DataFrames to process.

    Returns
    -------
    Dict[str, DataFrame]
        Dictionary of transformed DataFrames with keys as "df_1", "df_2", etc.
    """
    transformed_dfs: Dict[str, DataFrame] = {}

    # Loop through each DataFrame and apply the numeric column selection
    for i, df in enumerate(dataframes):
        transformed_df = select_numeric_columns(df)
        transformed_dfs[f"df_{i+1}"] = transformed_df

    return transformed_dfs


# Example Test Code
if __name__ == "__main__":
    from pyspark.sql import SparkSession
    from pyspark.sql.types import (
        StructType,
        StructField,
        StringType,
        IntegerType,
        FloatType,
    )

    # Initialize Spark session
    spark = (
        SparkSession.builder.master("local")
        .appName("NumericColumnSelection")
        .getOrCreate()
    )

    # Create example DataFrames with various column types
    data1 = [("Alice", 1, 23.5, "NY"), ("Bob", 2, 34.8, "CA")]
    data2 = [("Carol", "HR", 5000.0), ("Dave", "Finance", 7000.5)]
    data3 = [(1, "Electronics", 100.99), (2, "Clothing", 50.25)]

    schema1 = StructType(
        [
            StructField("name", StringType(), True),
            StructField("user_id", IntegerType(), True),
            StructField("age", FloatType(), True),
            StructField("state", StringType(), True),
        ]
    )

    schema2 = StructType(
        [
            StructField("name", StringType(), True),
            StructField("department", StringType(), True),
            StructField("dept_salary", FloatType(), True),
        ]
    )

    schema3 = StructType(
        [
            StructField("product_id", IntegerType(), True),
            StructField("category", StringType(), True),
            StructField("price", FloatType(), True),
        ]
    )

    df1 = spark.createDataFrame(data1, schema1)
    df2 = spark.createDataFrame(data2, schema2)
    df3 = spark.createDataFrame(data3, schema3)

    # Test the function
    dataframes_list = [df1, df2, df3]

    # Process the DataFrames to select only numeric columns
    result_dfs = process_dataframes_numeric_only(dataframes_list)

    # Display results
    for key, transformed_df in result_dfs.items():
        print(f"Transformed DataFrame {key}:")
        transformed_df.show()

24/10/26 12:08:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Transformed DataFrame df_1:
+-------+----+
|user_id| age|
+-------+----+
|      1|23.5|
|      2|34.8|
+-------+----+

Transformed DataFrame df_2:
+-----------+
|dept_salary|
+-----------+
|     5000.0|
|     7000.5|
+-----------+

Transformed DataFrame df_3:
+----------+------+
|product_id| price|
+----------+------+
|         1|100.99|
|         2| 50.25|
+----------+------+



In [8]:
from pyspark.sql import DataFrame
from typing import Dict, Callable


def apply_transformation_to_dict(
    dataframes_dict: Dict[str, DataFrame],
    transformation: Callable[[DataFrame], DataFrame],
) -> Dict[str, DataFrame]:
    """
    Apply a transformation to each DataFrame in a dictionary and store results in a new dictionary.

    Parameters
    ----------
    dataframes_dict : Dict[str, DataFrame]
        Dictionary where keys are names and values are DataFrames to transform.
    transformation : Callable[[DataFrame], DataFrame]
        Transformation function to apply to each DataFrame.

    Returns
    -------
    Dict[str, DataFrame]
        Dictionary with the same keys, containing transformed DataFrames.
    """
    transformed_dfs = {}

    for name, df in dataframes_dict.items():
        # Apply transformation to each DataFrame and store it with the same key
        transformed_dfs[name] = transformation(df)

    return transformed_dfs


# Example transformation function
def example_transformation(df: DataFrame) -> DataFrame:
    """
    Example transformation: remove rows with null values and select specific columns.

    Parameters
    ----------
    df : DataFrame
        DataFrame to transform.

    Returns
    -------
    DataFrame
        Transformed DataFrame.
    """
    return df.select("*")


# Example usage
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    # Initialize Spark session
    spark = (
        SparkSession.builder.master("local").appName("DictTransformation").getOrCreate()
    )

    # Example data for creating DataFrames
    data1 = [("Alice", 1, 23, "NY"), ("Bob", 2, None, "CA")]
    data2 = [("Carol", "HR", 5000), ("Dave", "Finance", None)]

    # Create DataFrames
    df1 = spark.createDataFrame(data1, ["name", "user_id", "age", "state"])
    df2 = spark.createDataFrame(data2, ["name", "department", "salary"])

    # Dictionary of DataFrames
    dataframes_dict = {"df1": df1, "df2": df2}

    # Apply transformation and retrieve transformed DataFrames in a new dictionary
    transformed_dataframes_dict = apply_transformation_to_dict(
        dataframes_dict, example_transformation
    )

    # Display results to verify
    for name, transformed_df in transformed_dataframes_dict.items():
        print(f"Transformed DataFrame '{name}':")
        transformed_df.show()

Transformed DataFrame 'df1':
+-----+-------+----+-----+
| name|user_id| age|state|
+-----+-------+----+-----+
|Alice|      1|  23|   NY|
|  Bob|      2|null|   CA|
+-----+-------+----+-----+

Transformed DataFrame 'df2':
+-----+----------+------+
| name|department|salary|
+-----+----------+------+
|Carol|        HR|  5000|
| Dave|   Finance|  null|
+-----+----------+------+



24/10/27 09:25:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1003757 ms exceeds timeout 120000 ms
24/10/27 09:25:19 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/27 09:25:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.