PySpark Exercises – Set 3 (Project, Nulls, Functions)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()

In [2]:
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(data, columns)
df_emp.show()
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)
df_perf.show()
project_data = [
("Ananya", "HR Portal", 120),
("Rahul", "Data Platform", 200),
("Priya", "Data Platform", 180),
("Zoya", "Campaign Tracker", 100),
("Karan", "HR Portal", 130),
("Naveen", "ML Pipeline", 220),
("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "HoursWorked"]
df_proj = spark.createDataFrame(project_data, columns_proj)
df_proj.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



Joins and Advanced Aggregations

In [4]:
# 1. Join employee_data , performance_data , and project_data .
df_joined=df_emp.join(df_perf, "Name").join(df_proj, "Name")
df_joined.show()

+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
+------+-----------+------+----+------+----------------+-----------+



In [5]:
# 2. Compute total hours worked per department.
from pyspark.sql.functions import sum
total_hours_per_dept = df_proj.groupBy("Project").agg(sum("HoursWorked").alias("TotalHoursWorked"))
total_hours_per_dept.show()

+----------------+----------------+
|         Project|TotalHoursWorked|
+----------------+----------------+
|   Data Platform|             380|
|       HR Portal|             250|
|     ML Pipeline|             220|
|Campaign Tracker|             190|
+----------------+----------------+



In [7]:
# 3. Compute average rating per project.
from pyspark.sql.functions import avg
avg_rating_per_project = df_joined.groupBy("Project").agg(avg("Rating").alias("AverageRating"))
avg_rating_per_project.show()

+----------------+------------------+
|         Project|     AverageRating|
+----------------+------------------+
|   Data Platform|               4.6|
|       HR Portal|               4.3|
|     ML Pipeline|               4.7|
|Campaign Tracker|3.8499999999999996|
+----------------+------------------+



Handling Missing Data (introduce some manually)

In [9]:
# 4. Add a row to performance_data with a None rating.
from pyspark.sql import Row
new_row = Row("Meena", 2023, None)
df_perf_null = df_perf.union(spark.createDataFrame([new_row], df_perf.schema))
df_perf_null.show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
| Meena|2023|  NULL|
+------+----+------+



In [10]:
# 5. Filter Rows with Null Rating
filtered_df = df_perf_null.filter(df_perf_null["Rating"].isNull())
filtered_df.show()

+-----+----+------+
| Name|Year|Rating|
+-----+----+------+
|Meena|2023|  NULL|
+-----+----+------+



In [14]:
# 6.Replace Null Ratings with Department Average
df_temp = df_emp.join(df_perf_null, "Name")
dept_avg = df_temp.groupBy("Department").agg(avg("Rating").alias("DeptAvg"))
df_filled = df_temp.join(dept_avg, "Department").withColumn("FilledRating", when(col("Rating").isNull(), col("DeptAvg")).otherwise(col("Rating"))).drop("Rating").withColumnRenamed("FilledRating", "Rating")
df_filled.show()

+-----------+------+------+----+------------------+------+
| Department|  Name|Salary|Year|           DeptAvg|Rating|
+-----------+------+------+----+------------------+------+
|Engineering| Rahul| 65000|2023| 4.633333333333334|   4.9|
|Engineering| Priya| 60000|2023| 4.633333333333334|   4.3|
|Engineering|Naveen| 70000|2023| 4.633333333333334|   4.7|
|         HR| Karan| 53000|2023|               4.3|   4.1|
|         HR|Ananya| 52000|2023|               4.3|   4.5|
|  Marketing|  Zoya| 48000|2023|3.8499999999999996|   3.8|
|  Marketing|Fatima| 45000|2023|3.8499999999999996|   3.9|
+-----------+------+------+----+------------------+------+



Built-In Functions and UDF

In [15]:
# 7. Create a column PerformanceCategory :
# Excellent (>=4.7),
# Good (4.0–4.69),
# Average (<4.0)
from pyspark.sql.functions import when
df_cat = df_filled.withColumn(
    "PerformanceCategory",
    when(col("Rating") >= 4.7, "Excellent")
    .when(col("Rating") >= 4.0, "Good")
    .otherwise("Average"))
df_cat.show()

+-----------+------+------+----+------------------+------+-------------------+
| Department|  Name|Salary|Year|           DeptAvg|Rating|PerformanceCategory|
+-----------+------+------+----+------------------+------+-------------------+
|Engineering| Rahul| 65000|2023| 4.633333333333334|   4.9|          Excellent|
|Engineering| Priya| 60000|2023| 4.633333333333334|   4.3|               Good|
|Engineering|Naveen| 70000|2023| 4.633333333333334|   4.7|          Excellent|
|         HR| Karan| 53000|2023|               4.3|   4.1|               Good|
|         HR|Ananya| 52000|2023|               4.3|   4.5|               Good|
|  Marketing|  Zoya| 48000|2023|3.8499999999999996|   3.8|            Average|
|  Marketing|Fatima| 45000|2023|3.8499999999999996|   3.9|            Average|
+-----------+------+------+----+------------------+------+-------------------+



In [16]:
# 8. Create a UDF to assign bonus:
# If project hours > 200 → 10,000
# Else → 5,000
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def assign_bonus(hours):
    return 10000 if hours > 200 else 5000

bonus_udf = udf(assign_bonus, IntegerType())
df_final = df_joined.withColumn("Bonus", bonus_udf(col("HoursWorked")))
df_final.show()

+------+-----------+------+----+------+----------------+-----------+-----+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|Bonus|
+------+-----------+------+----+------+----------------+-----------+-----+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120| 5000|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180| 5000|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200| 5000|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|10000|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90| 5000|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100| 5000|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130| 5000|
+------+-----------+------+----+------+----------------+-----------+-----+



Date and Time Functions

In [18]:
# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as difference from today.
from pyspark.sql.functions import current_date, months_between, to_date, lit
df_date = df_final.withColumn("JoinDate", to_date(lit("2021-06-01"))) \
    .withColumn("MonthsWorked", round(months_between(current_date(), col("JoinDate")), 1))
df_date.show()

+------+-----------+------+----+------+----------------+-----------+-----+----------+------------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|Bonus|  JoinDate|MonthsWorked|
+------+-----------+------+----+------+----------------+-----------+-----+----------+------------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120| 5000|2021-06-01|        48.3|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180| 5000|2021-06-01|        48.3|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200| 5000|2021-06-01|        48.3|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|10000|2021-06-01|        48.3|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90| 5000|2021-06-01|        48.3|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100| 5000|2021-06-01|        48.3|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130| 5000|2021-06-01|        48.3|
+------+--

In [20]:
# 10. Calculate how many employees joined before 2022.
df_date.filter(year(col("JoinDate")) < 2022).count()

7

Unions

In [21]:
# 11. Create another small team DataFrame and union() it with employee_data .
# extra_employees = [
# ("Meena", "HR", 48000),
# ("Raj", "Marketing", 51000)
# ]
extra_employees = [("Meena", "HR", 48000), ("Raj", "Marketing", 51000)]
df_extra = spark.createDataFrame(extra_employees, ["Name", "Department", "Salary"])
df_all = df_emp.union(df_extra)
df_all.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



Saving Results

In [22]:
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based on Department .
df_date.write.mode("overwrite").parquet("final_employee_data")