In [0]:
from pyspark.sql import SparkSession
# Get-or-create the Spark session for this notebook under the application
# name "PySparkExercise".
spark = (
    SparkSession.builder
    .appName("PySparkExercise")
    .getOrCreate()
)
# Bare expression on the last line so the notebook displays the session.
spark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan, avg, udf, current_date, months_between, to_date, lit, count
from pyspark.sql.types import IntegerType, StringType, FloatType

# --- Employees: name, department, salary -----------------------------------
columns_emp = ["Name", "Department", "Salary"]
employee_data = [
    ("Ananya", "HR", 50000),
    ("Rahul", "Engineering", 75000),
    ("Priya", "Engineering", 62000),
    ("Zoya", "Marketing", 58000),
    ("Karan", "HR", 52000),
    ("Naveen", "Engineering", 80000),
    ("Fatima", "Marketing", 49000),
]
empdf = spark.createDataFrame(data=employee_data, schema=columns_emp)

# --- Performance: one rating per employee for 2023 -------------------------
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
perfdf = spark.createDataFrame(data=performance, schema=columns_perf)

# --- Projects: project assignment and hours worked per employee ------------
columns_proj = ["Name", "Project", "HoursWorked"]
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90),
]
projdf = spark.createDataFrame(data=project_data, schema=columns_proj)

# Display the three source tables, in the order they were built.
empdf.show()
perfdf.show()
projdf.show()



+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 50000|
| Rahul|Engineering| 75000|
| Priya|Engineering| 62000|
|  Zoya|  Marketing| 58000|
| Karan|         HR| 52000|
|Naveen|Engineering| 80000|
|Fatima|  Marketing| 49000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



In [0]:
# Joins and Advanced Aggregations
# 1. Join employee_data, performance_data, and project_data.
# Inner joins on Name: only employees present in all three tables are kept.
joined = (
    empdf
    .join(perfdf, on="Name", how="inner")
    .join(projdf, on="Name", how="inner")
)

# 2. Compute total hours worked per department.
hours_by_dept = joined.groupBy("Department").sum("HoursWorked")

# 3. Compute average rating per project.
avg_rating_proj = joined.groupBy("Project").avg("Rating")

# Handling Missing Data (introduce some manually)
# 4. Add a row to performance_data with a None rating.
# The extra row goes into a separate DataFrame so that `perfdf` and `joined`
# above are left untouched. An existing employee is used (with a different
# year) so that step 6 can look up a department for the missing rating.
# Passing perfdf.schema keeps the column types identical to perfdf even though
# the only Rating value supplied here is None.
null_rating_row = spark.createDataFrame([("Karan", 2022, None)], schema=perfdf.schema)
perfdf_with_null = perfdf.unionByName(null_rating_row)

# 5. Filter rows with a null rating.
null_rows = perfdf_with_null.filter(col("Rating").isNull())

# 6. Replace null ratings with the department average.
# The employee/performance join is built once and reused for both the
# per-department average and the fill. avg() ignores null ratings, so the
# injected row does not distort the department averages.
emp_perf = empdf.join(perfdf_with_null, on="Name", how="inner")
dept_avg = emp_perf.groupBy("Department").agg(avg("Rating").alias("dept_avg"))
perfdf_with_dept = emp_perf.join(dept_avg, on="Department", how="inner")
perfdf_filled = perfdf_with_dept.withColumn(
    "Rating",
    when(col("Rating").isNull(), col("dept_avg")).otherwise(col("Rating")),
).select("Name", "Year", "Rating")

# Built-In Functions and UDF
# 7. Create a column PerformanceCategory:
#    Excellent (>= 4.7), Good (4.0–4.69), Average (< 4.0)
# The when() branches are tried in order, so the "Good" branch only sees
# ratings that did not match ">= 4.7" and needs no explicit upper bound.
# Any rating that matches neither branch is labelled "Average".
joined = joined.withColumn(
    "PerformanceCategory",
    when(col("Rating") >= 4.7, "Excellent")
    .when(col("Rating") >= 4.0, "Good")
    .otherwise("Average"),
)

# 8. Create a UDF to assign a bonus:
#    if project hours > 200 → 10,000, else → 5,000.
# (Section heading for step 9 below: Date and Time Functions)
def calc_bonus(hours):
    """Return the bonus amount for a given number of project hours.

    Args:
        hours: Hours worked on the project, or None when the value is missing.

    Returns:
        10000 when ``hours`` is greater than 200, 5000 otherwise, and None
        when ``hours`` is None.
    """
    # A null column value reaches a Python UDF as None, and `None > 200`
    # raises TypeError; pass the missing value through as None instead.
    if hours is None:
        return None
    return 10000 if hours > 200 else 5000
# Wrap calc_bonus as a Spark UDF with an integer return type, then derive the
# Bonus column from HoursWorked.
bonus_udf = udf(calc_bonus, returnType=IntegerType())
joined = joined.withColumn(
    "Bonus",
    bonus_udf(col("HoursWorked")),
)

# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as
# difference from today.
joined = joined.withColumn("JoinDate", to_date(lit("2021-06-01")))
joined = joined.withColumn("MonthsWorked", months_between(current_date(), col("JoinDate")))

# 10. Calculate how many employees joined before 2022.
joined_before_2022 = joined.filter(col("JoinDate") < to_date(lit("2022-01-01")))
# The task asks for a number, not just the matching rows. `joined` holds one
# row per employee here, so the row count is the employee count.
joined_before_2022_count = joined_before_2022.count()
print("Employees who joined before 2022:", joined_before_2022_count)

# Unions
# 11. Create another small team DataFrame and union() it with employee_data.
# extra_employees = [
# ("Meena", "HR", 48000),
# ("Raj", "Marketing", 51000)
# ]
extra_employees = [("Meena", "HR", 48000), ("Raj", "Marketing", 51000)]
# Both frames are built with columns_emp, so the positional union() lines the
# columns up correctly.
extra_df = spark.createDataFrame(extra_employees, columns_emp)
empdf_union = empdf.union(extra_df)

# Saving Results
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based
# on Department.
# mode("overwrite") replaces any previous output at the same path.
(
    joined.write
    .mode("overwrite")
    .partitionBy("Department")
    .parquet("/mnt/data/final_employee_partitioned")
)

# Source tables.
empdf.show()
perfdf.show()
projdf.show()

# Enriched join, limited to the columns derived or used in steps 7-8.
joined.select(
    "Name",
    "Salary",
    "Rating",
    "Bonus",
    "PerformanceCategory",
).show()

# Aggregations (steps 2-3).
hours_by_dept.show()
avg_rating_proj.show()

# Missing-data check (rows with a null Rating).
null_rows.show()

# Employees whose JoinDate is before 2022 (step 10).
joined_before_2022.select(
    "Name",
    "JoinDate",
).show()

# Employees plus the extra team (step 11).
empdf_union.show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 50000|
| Rahul|Engineering| 75000|
| Priya|Engineering| 62000|
|  Zoya|  Marketing| 58000|
| Karan|         HR| 52000|
|Naveen|Engineering| 80000|
|Fatima|  Marketing| 49000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+

+------+------+------+-----+------------