PySpark Exercises – Set 2 (Advanced)

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("AdvancedEmployeeTasks").getOrCreate()

In [2]:
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(data, columns)
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)

GroupBy and Aggregations

In [4]:
# 1. Get the average salary by department
from pyspark.sql.functions import avg
df_emp.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  65000.0|
|         HR|  52500.0|
|  Marketing|  46500.0|
+-----------+---------+



In [5]:
#  2. Count of employees per department
df_emp.groupBy("Department").count().show()

+-----------+-----+
| Department|count|
+-----------+-----+
|Engineering|    3|
|         HR|    2|
|  Marketing|    2|
+-----------+-----+



In [6]:
# 3. Max and Min salary in Engineering
from pyspark.sql.functions import max, min
df_emp.filter(df_emp.Department == "Engineering").select(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary")).show()

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



Join and Combine Data

In [9]:
#  4. Inner join on Name
df_joined = df_emp.join(df_perf, on="Name", how="inner")
df_joined.show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+



In [10]:
# 5. Show each employee's salary and rating
df_joined.select("Name", "Salary", "Rating").show()

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+



In [14]:
# 6. Filter employees with rating > 4.5 and salary > 60000.
from pyspark.sql.functions import col
df_joined.filter((col('Rating') > 4.5) & (col('Salary') > 60000)).show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



Window & Rank (Bonus Challenge)

In [16]:
# 7. Rank employees by salary department-wise.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
window_spec = Window.partitionBy("Department").orderBy(df_emp["Salary"].desc())
df_emp.withColumn("Rank", row_number().over(window_spec)).show()

+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+



In [18]:
# 8. Cumulative salary in each department
from pyspark.sql.functions import sum
windowCum = Window.partitionBy("Department").orderBy("Salary").rowsBetween(Window.unboundedPreceding, 0)
df_cumsum = df_joined.withColumn("CumulativeSalary", sum("Salary").over(windowCum))
df_cumsum.select("Name", "Department", "Salary", "CumulativeSalary").show()

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
| Priya|Engineering| 60000|           60000|
| Rahul|Engineering| 65000|          125000|
|Naveen|Engineering| 70000|          195000|
|Ananya|         HR| 52000|           52000|
| Karan|         HR| 53000|          105000|
|Fatima|  Marketing| 45000|           45000|
|  Zoya|  Marketing| 48000|           93000|
+------+-----------+------+----------------+



Date Operations

In [19]:
# 9. Add JoinDate with random dates (2020–2023)
from datetime import datetime, timedelta
import random
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def random_date(start_year=2020, end_year=2023):
    start = datetime(start_year, 1, 1)
    end = datetime(end_year, 12, 31)
    delta = end - start
    random_days = random.randint(0, delta.days)
    return (start + timedelta(days=random_days)).strftime("%Y-%m-%d")

random_date_udf = udf(random_date, StringType())
df_dated = df_joined.withColumn("JoinDate", random_date_udf())
df_dated.show()


+------+-----------+------+----+------+----------+
|  Name| Department|Salary|Year|Rating|  JoinDate|
+------+-----------+------+----+------+----------+
|Ananya|         HR| 52000|2023|   4.5|2020-05-02|
|Fatima|  Marketing| 45000|2023|   3.9|2022-04-15|
| Karan|         HR| 53000|2023|   4.1|2023-12-24|
|Naveen|Engineering| 70000|2023|   4.7|2021-02-18|
| Priya|Engineering| 60000|2023|   4.3|2021-09-11|
| Rahul|Engineering| 65000|2023|   4.9|2023-08-09|
|  Zoya|  Marketing| 48000|2023|   3.8|2022-05-07|
+------+-----------+------+----+------+----------+



In [21]:
# 10. Add YearsWithCompany
from pyspark.sql.functions import current_date, datediff, col, round
df_with_tenure = df_dated.withColumn("JoinDate", col("JoinDate").cast("date"))
df_with_tenure = df_with_tenure.withColumn("YearsWithCompany", round(datediff(current_date(), col("JoinDate")) / 365, 2))
df_with_tenure.select("Name", "JoinDate", "YearsWithCompany").show()

+------+----------+----------------+
|  Name|  JoinDate|YearsWithCompany|
+------+----------+----------------+
|Ananya|2021-03-04|            4.27|
|Fatima|2023-01-24|            2.38|
| Karan|2022-02-08|            3.34|
|Naveen|2022-04-19|            3.15|
| Priya|2021-10-29|            3.62|
| Rahul|2022-08-09|            2.84|
|  Zoya|2020-12-10|             4.5|
+------+----------+----------------+



Writing to Files

In [22]:
#  11. Write full employee DataFrame to CSV with headers
df_emp.write.csv("employee_data.csv", header=True)

In [23]:
# 12. Save the joined DataFrame to Parquet
df_joined.write.parquet("joined_data.parquet")

In [24]:
# Read back the saved CSV file:
df_read_csv = spark.read.csv("employee_data.csv", header=True, inferSchema=True)
df_read_csv.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
+------+-----------+------+



In [25]:
# Read back the saved Parquet file:
df_read_parquet = spark.read.parquet("joined_data.parquet")
df_read_parquet.show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+

