In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col

# Spark's `sum` is imported under an alias: a bare `import sum` would rebind
# Python's built-in `sum` for every cell that shares this kernel.
from pyspark.sql.functions import sum as spark_sum

# Initialize Spark session
spark = SparkSession.builder.appName("ListComprehensionsExample").getOrCreate()

# Sample data: one (name, subject, score) row per student and subject
data = [
    ("Alice", "Math", 85),
    ("Alice", "English", 78),
    ("Bob", "Math", 92),
    ("Bob", "English", 83),
    ("Charlie", "Math", 79),
    ("Charlie", "English", 82),
]

# Creating DataFrame
columns = ["Name", "Subject", "Score"]
df = spark.createDataFrame(data, columns)

# Show the original DataFrame
df.show()

# List of columns to aggregate
columns_to_agg = ["Score"]

# Aggregation functions as (function key, output-column prefix) pairs
agg_funcs = [("sum", "Total"), ("avg", "Average")]

# Mapping of function names to actual PySpark functions
func_mapping = {"sum": spark_sum, "avg": avg}

# Generate aggregation expressions using list comprehension:
# one aliased expression per (column, function) combination,
# e.g. sum(Score) AS Total_Score.
agg_exprs = [
    func_mapping[func](col(col_name)).alias(f"{prefix}_{col_name}")
    for col_name in columns_to_agg
    for func, prefix in agg_funcs
]

# Group by and aggregate with dynamic columns and functions
dynamic_grouped_df = df.groupBy("Name").agg(*agg_exprs)

# Show the dynamically aggregated DataFrame
dynamic_grouped_df.show()

# Stop the Spark session
spark.stop()

24/05/25 14:11:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+-------+-------+-----+
|   Name|Subject|Score|
+-------+-------+-----+
|  Alice|   Math|   85|
|  Alice|English|   78|
|    Bob|   Math|   92|
|    Bob|English|   83|
|Charlie|   Math|   79|
|Charlie|English|   82|
+-------+-------+-----+

+-------+-----------+-------------+
|   Name|Total_Score|Average_Score|
+-------+-----------+-------------+
|  Alice|        163|         81.5|
|    Bob|        175|         87.5|
|Charlie|        161|         80.5|
+-------+-----------+-------------+



In [None]:
from pyspark.sql import SparkSession

# `col` is imported here so the cell also runs on a fresh kernel instead of
# depending on a name left behind by an earlier cell.
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("ListComprehensionsExample").getOrCreate()

# Sample data
data = [("Alice", "Math", 85), ("Bob", "English", 78), ("Charlie", "Math", 92)]
columns = ["Name", "Subject", "Score"]
df = spark.createDataFrame(data, columns)

# List of columns to select
columns_to_select = ["Name", "Score"]

# Using list comprehension to select columns: each name becomes a Column
# object and the resulting list is handed to select().
selected_df = df.select([col(column) for column in columns_to_select])
selected_df.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, udf, when
from pyspark.sql.types import StringType

# Initialize Spark session
spark = SparkSession.builder.appName("RenameValuesExample").getOrCreate()

# Sample data
data = [("Alice", "Math", 85), ("Bob", "English", 78), ("Charlie", "Math", 92)]
columns = ["Name", "Subject", "Score"]
df = spark.createDataFrame(data, columns)

# Show the original DataFrame
df.show()

# Dictionary for mapping old values to new values
subject_mapping = {"Math": "Mathematics", "English": "English Literature"}

# 1. Using Dictionary Mapping
# Start from the original subject when it is a known key, "Other" otherwise.
# Column.isin only unpacks a single list or set argument; a dict_keys view is
# neither, so the keys are materialized as a list first.
renamed_df = df.withColumn(
    "RenamedSubject",
    when(col("Subject").isin(list(subject_mapping)), col("Subject")).otherwise(
        "Other"
    ),
)
# Rewrite each known value in turn; rows that do not match the current key
# keep whatever RenamedSubject already holds.
for old_val, new_val in subject_mapping.items():
    renamed_df = renamed_df.withColumn(
        "RenamedSubject",
        when(col("RenamedSubject") == old_val, new_val).otherwise(
            col("RenamedSubject")
        ),
    )
renamed_df.show()

# 2. Using List Comprehensions
# Prepend a fixed prefix to every subject that matches one of the mapping keys;
# all other subjects are copied through unchanged.
prefix = "Subject: "

# rlike performs a regex search, so the "|"-joined keys match any subject that
# contains one of them.
subject_pattern = "|".join(subject_mapping.keys())

prefixed_df = df.withColumn(
    "PrefixedSubject",
    when(
        col("Subject").rlike(subject_pattern),
        # `prefix + col(...)` would build a numeric addition in Spark, not a
        # string concatenation; concat() with a literal joins the two strings.
        concat(lit(prefix), col("Subject")),
    ).otherwise(col("Subject")),
)
prefixed_df.show()

# 3. Using Custom Functions (UDF)


def rename_subject(subject):
    """Return the replacement for `subject` from `subject_mapping`.

    Values that are not keys of the mapping are returned unchanged.
    """
    return subject_mapping.get(subject, subject)


# Wrap the plain Python function as a Spark UDF that returns strings
rename_subject_udf = udf(f=rename_subject, returnType=StringType())

# Apply the UDF to the Subject column and store the result in a new column
renamed_subject_column = rename_subject_udf(col("Subject"))
udf_renamed_df = df.withColumn("RenamedSubject", renamed_subject_column)
udf_renamed_df.show()

# 4. Using replace Method
# The dict maps each old value to its replacement; `subset` restricts the
# substitution to the Subject column.
replace_renamed_df = df.na.replace(subject_mapping, subset=["Subject"])
replace_renamed_df.show()

# Stop the Spark session
# NOTE(review): this cell does not actually call spark.stop() -- confirm
# whether the session is meant to stay open for later cells.

In [None]:
# Imported in this cell so it does not depend on a later cell having been run
# first to provide `F`.
from pyspark.sql import functions as F

# Values strictly greater than this are flagged with 1, everything else with 0
threshold = 10

# NOTE(review): `df_event_count` is not created in any visible cell; it must
# already exist in the kernel -- confirm where it is defined.

# List of columns to check against the threshold (every column but the key)
columns_to_check = [
    col_name for col_name in df_event_count.columns if col_name != "CASE_KEY"
]

# Create new columns with the condition using list comprehension.
# Column names are backtick-quoted so names with dots or spaces resolve as a
# single identifier; a backtick inside a name is escaped by doubling it.
conditional_columns = [
    F.when(F.col("`" + column.replace("`", "``") + "`") > threshold, 1)
    .otherwise(0)
    .alias(f"{column}_gt_{threshold}")
    for column in columns_to_check
]

# Select original columns and new conditional columns
result_df = df_event_count.select("*", *conditional_columns)

result_df.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Build (or reuse) a session here so the cell does not depend on a `spark`
# variable left over from another cell.
spark = SparkSession.builder.appName("CreateMapExample").getOrCreate()

# Sample DataFrame
data = [("Alice", 34, "F"), ("Bob", 45, "M")]
df = spark.createDataFrame(data, ["name", "age", "gender"])

# Create a map column from 'age' and 'gender'.
# create_map takes alternating key/value arguments. All values of a map share
# one type, so the numeric age is cast to string explicitly instead of relying
# on implicit coercion, which depends on the session's ANSI setting.
df_with_map = df.withColumn(
    "info_map",
    F.create_map(
        F.lit("age"),
        F.col("age").cast("string"),
        F.lit("gender"),
        F.col("gender"),
    ),
)

df_with_map.show(truncate=False)

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("WithColumnListComprehension")
    .getOrCreate()
)

# Two-row sample frame with id, name and score
sample_rows = [(1, "Alice", 50), (2, "Bob", 45)]
sample_schema = ["id", "name", "score"]
df = spark.createDataFrame(sample_rows, sample_schema)

# (new column name, expression) pairs, all derived from `score`
new_columns = [
    ("score_plus_10", col("score") + 10),
    ("double_score", col("score") * 2),
    ("is_passing", col("score") > 40),
]

# Alias every expression with its target name via a list comprehension, then
# append them all to the original columns in a single select
aliased_exprs = [col_expr.alias(col_name) for col_name, col_expr in new_columns]
df_with_columns = df.select("*", *aliased_exprs)

df_with_columns.show()

# Release the Spark session
spark.stop()

                                                                                

+---+-----+-----+-------------+------------+----------+
| id| name|score|score_plus_10|double_score|is_passing|
+---+-----+-----+-------------+------------+----------+
|  1|Alice|   50|           60|         100|      true|
|  2|  Bob|   45|           55|          90|      true|
+---+-----+-----+-------------+------------+----------+



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("WithColumnsDictionary")
    .getOrCreate()
)

# Two-row sample frame with id, name and score
sample_rows = [(1, "Alice", 50), (2, "Bob", 45)]
sample_schema = ["id", "name", "score"]
df = spark.createDataFrame(sample_rows, sample_schema)

# New column name -> expression that computes it
new_columns_dict = {
    "score_plus_10": col("score") + 10,
    "double_score": col("score") * 2,
    "is_passing": col("score") > 40,
}

# Walk the dictionary in insertion order and attach one column per entry
for col_name in new_columns_dict:
    col_expr = new_columns_dict[col_name]
    df = df.withColumn(col_name, col_expr)

df.show()

# Release the Spark session
spark.stop()

                                                                                

+---+-----+-----+-------------+------------+----------+
| id| name|score|score_plus_10|double_score|is_passing|
+---+-----+-----+-------------+------------+----------+
|  1|Alice|   50|           60|         100|      true|
|  2|  Bob|   45|           55|          90|      true|
+---+-----+-----+-------------+------------+----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("FilterMultipleConditionsVariables")
    .getOrCreate()
)

# Sample employees: id, name, score, department
employee_rows = [
    (1, "Alice", 50, "HR"),
    (2, "Bob", 45, "IT"),
    (3, "Charlie", 60, "Finance"),
    (4, "David", 40, "HR"),
]
employee_schema = ["id", "name", "score", "department"]
df = spark.createDataFrame(employee_rows, employee_schema)

# Each predicate is kept in its own variable so it can be reused or recombined
condition1 = col("score") > 40
condition2 = col("score") < 60
condition3 = col("department") == "HR"

# A row must satisfy all three predicates ('&' is the column-level AND)
combined_condition = (condition1 & condition2) & condition3

# Keep only the rows that satisfy the combined predicate
filtered_df = df.where(combined_condition)

filtered_df.show()

# Release the Spark session
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("FilterMultipleConditionsDictionary")
    .getOrCreate()
)

# Sample employees: id, name, score, department
employee_rows = [
    (1, "Alice", 50, "HR"),
    (2, "Bob", 45, "IT"),
    (3, "Charlie", 60, "Finance"),
    (4, "David", 40, "HR"),
]
employee_schema = ["id", "name", "score", "department"]
df = spark.createDataFrame(employee_rows, employee_schema)

# Named predicates, stored by a descriptive key
conditions = {
    "score_gt_40": col("score") > 40,
    "score_lt_60": col("score") < 60,
    "department_hr": col("department") == "HR",
}

# AND the predicates together in the order they were declared:
# start from the first one and fold each remaining one in with '&'
remaining_conditions = iter(conditions.values())
combined_condition = next(remaining_conditions)
for extra_condition in remaining_conditions:
    combined_condition = combined_condition & extra_condition

# Keep only the rows that satisfy the combined predicate
filtered_df = df.where(combined_condition)

filtered_df.show()

# Release the Spark session
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("NestedConditions")
    .getOrCreate()
)

# Sample employees: id, name, score, department
employee_rows = [
    (1, "Alice", 50, "HR"),
    (2, "Bob", 45, "IT"),
    (3, "Charlie", 60, "Finance"),
    (4, "David", 40, "HR"),
    (5, "Eve", 30, "IT"),
]
employee_schema = ["id", "name", "score", "department"]
df = spark.createDataFrame(employee_rows, employee_schema)

# Building blocks of the nested predicate
score_above_40 = col("score") > 40
is_alice = col("name") == "Alice"
in_hr = col("department") == "HR"

# score > 40 AND (name is Alice OR department is HR)
condition = score_above_40 & (is_alice | in_hr)

# Keep only the rows that satisfy the nested predicate
filtered_df = df.where(condition)

filtered_df.show()

# Release the Spark session
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("ComplexNestedConditions")
    .getOrCreate()
)

# Sample employees: id, name, score, department
employee_rows = [
    (1, "Alice", 50, "HR"),
    (2, "Bob", 45, "IT"),
    (3, "Charlie", 60, "Finance"),
    (4, "David", 40, "HR"),
    (5, "Eve", 30, "IT"),
]
employee_schema = ["id", "name", "score", "department"]
df = spark.createDataFrame(employee_rows, employee_schema)

# Building blocks of the complex predicate
score_between_40_60 = (col("score") > 40) & (col("score") < 60)
in_finance = col("department") == "Finance"
not_david = col("name") != "David"

# (40 < score < 60 OR department is Finance) AND name is not David
condition = (score_between_40_60 | in_finance) & not_david

# Keep only the rows that satisfy the complex predicate
filtered_df = df.where(condition)

filtered_df.show()

# Release the Spark session
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this example
spark = (
    SparkSession.builder
    .appName("DictionaryNestedConditions")
    .getOrCreate()
)

# Sample employees: id, name, score, department
employee_rows = [
    (1, "Alice", 50, "HR"),
    (2, "Bob", 45, "IT"),
    (3, "Charlie", 60, "Finance"),
    (4, "David", 40, "HR"),
    (5, "Eve", 30, "IT"),
]
employee_schema = ["id", "name", "score", "department"]
df = spark.createDataFrame(employee_rows, employee_schema)

# Named predicates, stored by a descriptive key
conditions = {
    "score_between_40_60": (col("score") > 40) & (col("score") < 60),
    "department_finance": col("department") == "Finance",
    "name_not_david": col("name") != "David",
}

# (score in range OR department is Finance) AND name is not David
score_or_finance = (
    conditions["score_between_40_60"] | conditions["department_finance"]
)
combined_condition = score_or_finance & conditions["name_not_david"]

# Keep only the rows that satisfy the combined predicate
filtered_df = df.where(combined_condition)

filtered_df.show()

# Release the Spark session
spark.stop()