In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session for this notebook (an existing one is reused).
spark = (
    SparkSession.builder
    .appName("Data Manipulation Example")
    .getOrCreate()
)

# Three-row demo DataFrame with columns id, name and score.
columns = ["id", "name", "score"]
data = [(1, "Alice", 100), (2, "Bob", 200), (3, "Charlie", 150)]
df = spark.createDataFrame(data, columns)

# Print the input rows.
df.show()

# Derive 'score_squared' as score raised to the power 2.
# withColumn returns a new DataFrame; `df` itself is not modified.
df_with_new_column = df.withColumn("score_squared", F.pow(F.col("score"), 2))

# Print only the derived column.
df_with_new_column.select("score_squared").show()



+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|  100|
|  2|    Bob|  200|
|  3|Charlie|  150|
+---+-------+-----+

+-------------+
|score_squared|
+-------------+
|      10000.0|
|      40000.0|
|      22500.0|
+-------------+



In [0]:
# selectExpr takes SQL expression strings: keep id, name and score as they
# are, and add 'score_with_bonus' computed as score * 1.1.
df_with_bonus = df.selectExpr(
    "id",
    "name",
    "score",
    "score * 1.1 as score_with_bonus",
)

# Print the DataFrame including the new column.
df_with_bonus.show()


+---+-------+-----+----------------+
| id|   name|score|score_with_bonus|
+---+-------+-----+----------------+
|  1|  Alice|  100|           110.0|
|  2|    Bob|  200|           220.0|
|  3|Charlie|  150|           165.0|
+---+-------+-----+----------------+



In [0]:
# Second demo dataset: the id/name/score columns plus a 'group' label,
# used to demonstrate grouped aggregation.
# NOTE: this rebinds `data`, `columns` and `df` from the first cell, so an
# earlier cell that is re-run after this one will see this five-row frame.
data = [
    (1, "Alice", 100, "A"),
    (2, "Bob", 200, "B"),
    (3, "Charlie", 150, "A"),
    (4, "David", 300, "B"),
    (5, "Eve", 200, "A")
]

columns = ["id", "name", "score", "group"]
df = spark.createDataFrame(data, columns)

# Show original DataFrame
df.show()

# Group by 'group' and compute three summaries per group:
#   avg_score   - mean of 'score'
#   total_score - sum of 'score'
#   count       - number of non-null 'id' values
df_grouped = df.groupBy("group").agg(
    F.avg("score").alias("avg_score"),
    F.sum("score").alias("total_score"),
    F.count("id").alias("count")
)

# Show the aggregated DataFrame. groupBy alone does not guarantee any row
# order, so sort by 'group' for display to keep the printed table stable
# across runs; `df_grouped` itself is left unsorted.
df_grouped.orderBy("group").show()


+---+-------+-----+-----+
| id|   name|score|group|
+---+-------+-----+-----+
|  1|  Alice|  100|    A|
|  2|    Bob|  200|    B|
|  3|Charlie|  150|    A|
|  4|  David|  300|    B|
|  5|    Eve|  200|    A|
+---+-------+-----+-----+

+-----+---------+-----------+-----+
|group|avg_score|total_score|count|
+-----+---------+-----------+-----+
|    A|    150.0|        450|    3|
|    B|    250.0|        500|    2|
+-----+---------+-----------+-----+

