In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                    .appName("CombineData") \
                    .getOrCreate()
spark

Reading CSV files

### Dataset: Combine Existing Data

In [0]:
employee_data = [ 
    ("Ananya", "HR", 52000), 
    ("Rahul", "Engineering", 65000), 
    ("Priya", "Engineering", 60000), 
    ("Zoya", "Marketing", 48000), 
    ("Karan", "HR", 53000), 
    ("Naveen", "Engineering", 70000), 
    ("Fatima", "Marketing", 45000) 
]
employee_columns = ["Name", "Department", "Salary"]
employee_df = spark.createDataFrame(employee_data, employee_columns)

performance_data = [ 
    ("Ananya", 2023, 4.5), 
    ("Rahul", 2023, 4.9), 
    ("Priya", 2023, 4.3), 
    ("Zoya", 2023, 3.8), 
    ("Karan", 2023, 4.1), 
    ("Naveen", 2023, 4.7), 
    ("Fatima", 2023, 3.9) 
]
performance_column = ["Name", "Year", "Rating"]
performance_df = spark.createDataFrame(performance_data, performance_column)

# Project Data
project_data = [ 
    ("Ananya", "HR Portal", 120), 
    ("Rahul", "Data Platform", 200), 
    ("Priya", "Data Platform", 180), 
    ("Zoya", "Campaign Tracker", 100), 
    ("Karan", "HR Portal", 130), 
    ("Naveen", "ML Pipeline", 220), 
    ("Fatima", "Campaign Tracker", 90) 
]
project_column = ["Name", "Project", "HoursWorked"]
project_df = spark.createDataFrame(project_data, project_column)


### Dataset 3: project_data

###  PySpark Exercises – Set 3 (Project, Nulls, Functions)

#### Joins and Advanced Aggregations

1. Join employee_data, performance_data and project _data .

In [0]:
emp_perf_df = employee_df.join(performance_df, on="Name", how="inner")
full_df = emp_perf_df.join(project_df, on="Name", how="inner")

full_df.show()

+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



2. Compute total hours worked per department

In [0]:
full_df.groupBy("Department") \
                 .agg(sum("HoursWorked").alias("TotalHoursWorked")) \
                 .show()

+-----------+----------------+
| Department|TotalHoursWorked|
+-----------+----------------+
|         HR|             250|
|Engineering|             600|
|  Marketing|             190|
+-----------+----------------+



3. Compute average rating per project

In [0]:
from pyspark.sql.functions import avg, round
average_rating = full_df.groupBy("Project") \
                        .agg(round(avg("Rating"),2).alias("AverageRating"))

average_rating.show()

+----------------+-------------+
|         Project|AverageRating|
+----------------+-------------+
|       HR Portal|          4.3|
|   Data Platform|          4.6|
|Campaign Tracker|         3.85|
|     ML Pipeline|          4.7|
+----------------+-------------+



#### Handling Missing Data (introduce some manually)

4. Add a row to performance_data with a None rating .

In [0]:
from pyspark.sql.functions import col, when

new_data = [("Ashwin Harish", 2024, None)]
new_data_df = spark.createDataFrame(new_data, schema=performance_df.schema)


performance_df = performance_df.union(new_data_df)

performance_df.show()

+-------------+----+------+
|         Name|Year|Rating|
+-------------+----+------+
|       Ananya|2023|   4.5|
|        Rahul|2023|   4.9|
|        Priya|2023|   4.3|
|         Zoya|2023|   3.8|
|        Karan|2023|   4.1|
|       Naveen|2023|   4.7|
|       Fatima|2023|   3.9|
|Ashwin Harish|2024|  NULL|
+-------------+----+------+



5. Filter rows with null values

In [0]:
performance_df.filter(performance_df.Rating.isNull()) \
              .show()

+-------------+----+------+
|         Name|Year|Rating|
+-------------+----+------+
|Ashwin Harish|2024|  NULL|
+-------------+----+------+



6. Replace null ratings with the department average

In [0]:
dept_avg = full_df.groupBy("Department") \
                  .agg(avg("Rating").alias("DeptAvgRating"))

full_df_with_avg = full_df.join(dept_avg, on="Department", how="left")

performance_df = full_df_with_avg.withColumn("Rating",
                                            when(col("Rating").isNull(), col("DeptAvgRating")).otherwise(col("Rating"))) \
                                            .drop("DeptAvgRating")

performance_df.show()

+-----------+------+------+----+------+----------------+-----------+
| Department|  Name|Salary|Year|Rating|         Project|HoursWorked|
+-----------+------+------+----+------+----------------+-----------+
|         HR|Ananya| 52000|2023|   4.5|       HR Portal|        120|
|Engineering| Rahul| 65000|2023|   4.9|   Data Platform|        200|
|Engineering| Priya| 60000|2023|   4.3|   Data Platform|        180|
|         HR| Karan| 53000|2023|   4.1|       HR Portal|        130|
|  Marketing|  Zoya| 48000|2023|   3.8|Campaign Tracker|        100|
|Engineering|Naveen| 70000|2023|   4.7|     ML Pipeline|        220|
|  Marketing|Fatima| 45000|2023|   3.9|Campaign Tracker|         90|
+-----------+------+------+----+------+----------------+-----------+



####  Built-In Functions and UDF

7. Create a column \
PerformanceCategory : \
 Excellent (>=4.7), \
 Good (4.0–4.69), \
 Average (<4.0)

In [0]:
performance_df = performance_df.withColumn("PerformanceCategory", 
                                           when(performance_df.Rating >= 4.7, "Excellent")
                                           .when((performance_df.Rating >= 4.0) & (performance_df.Rating <= 4.69), "Good")
                                           .otherwise("Average")
                                           )
performance_df.show()

+-----------+------+------+----+------+----------------+-----------+-------------------+
| Department|  Name|Salary|Year|Rating|         Project|HoursWorked|PerformanceCategory|
+-----------+------+------+----+------+----------------+-----------+-------------------+
|         HR|Ananya| 52000|2023|   4.5|       HR Portal|        120|               Good|
|Engineering| Rahul| 65000|2023|   4.9|   Data Platform|        200|          Excellent|
|Engineering| Priya| 60000|2023|   4.3|   Data Platform|        180|               Good|
|  Marketing|  Zoya| 48000|2023|   3.8|Campaign Tracker|        100|            Average|
|         HR| Karan| 53000|2023|   4.1|       HR Portal|        130|               Good|
|Engineering|Naveen| 70000|2023|   4.7|     ML Pipeline|        220|          Excellent|
|  Marketing|Fatima| 45000|2023|   3.9|Campaign Tracker|         90|            Average|
+-----------+------+------+----+------+----------------+-----------+-------------------+



3. Create a UDF to assign bonus based on HoursWorked

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def bonus_udf(hours):
    if hours is None:
        return 0 
    return 10000 if hours > 200 else 5000

bonus = udf(bonus_udf, IntegerType())

performance_df = performance_df.withColumn("Bonus", bonus(col("HoursWorked")))
performance_df.show()

+-----------+------+------+----+------+----------------+-----------+-------------------+-----+
| Department|  Name|Salary|Year|Rating|         Project|HoursWorked|PerformanceCategory|Bonus|
+-----------+------+------+----+------+----------------+-----------+-------------------+-----+
|         HR|Ananya| 52000|2023|   4.5|       HR Portal|        120|               Good| 5000|
|Engineering| Priya| 60000|2023|   4.3|   Data Platform|        180|               Good| 5000|
|Engineering| Rahul| 65000|2023|   4.9|   Data Platform|        200|          Excellent| 5000|
|  Marketing|  Zoya| 48000|2023|   3.8|Campaign Tracker|        100|            Average| 5000|
|         HR| Karan| 53000|2023|   4.1|       HR Portal|        130|               Good| 5000|
|Engineering|Naveen| 70000|2023|   4.7|     ML Pipeline|        220|          Excellent|10000|
|  Marketing|Fatima| 45000|2023|   3.9|Campaign Tracker|         90|            Average| 5000|
+-----------+------+------+----+------+-----------

#### Date and Time Functions

9. Add a column JoinDate with 2021-06-01 for all, then add 
difference from today.

In [0]:
from pyspark.sql.functions import lit, to_date, date_diff, current_date

employee_df = employee_df.withColumn("JoinDate", to_date(lit("2021-06-01")))
employee_df = employee_df.withColumn("Difference_in_days", date_diff(current_date(), employee_df.JoinDate).alias('date_diff'))
employee_df.show()

+------+-----------+------+----------+------------------+
|  Name| Department|Salary|  JoinDate|Difference_in_days|
+------+-----------+------+----------+------------------+
|Ananya|         HR| 52000|2021-06-01|              1471|
| Rahul|Engineering| 65000|2021-06-01|              1471|
| Priya|Engineering| 60000|2021-06-01|              1471|
|  Zoya|  Marketing| 48000|2021-06-01|              1471|
| Karan|         HR| 53000|2021-06-01|              1471|
|Naveen|Engineering| 70000|2021-06-01|              1471|
|Fatima|  Marketing| 45000|2021-06-01|              1471|
+------+-----------+------+----------+------------------+



10. Calculate how many employees joined before 2022.

In [0]:
employee_count = employee_df.filter(col("JoinDate") < "2022-01-01").count()
print(f"Number of employees joined before 2022: {employee_count}")

Number of employees joined before 2022: 7


#### Unions

11. create another small team DataFrame and union() it with employee_data .

extra_employees = \
[ 
("Meena", "HR", 48000), 
("Raj", "Marketing", 51000) 
]

In [0]:
new_employees_data = [
    ("Meena", "HR", 48000, "2021-06-01", 1471),
    ("Raj", "Marketing", 51000, "2021-06-01", 1471)
]

new_employees_columns = ["Name", "Department", "Salary", "JoinDate", "Difference_in_days"]

new_employees_df = spark.createDataFrame(new_employees_data, new_employees_columns)

combined_employee_df = employee_df.union(new_employees_df)

combined_employee_df.show()

+------+-----------+------+----------+------------------+
|  Name| Department|Salary|  JoinDate|Difference_in_days|
+------+-----------+------+----------+------------------+
|Ananya|         HR| 52000|2021-06-01|              1471|
| Rahul|Engineering| 65000|2021-06-01|              1471|
| Priya|Engineering| 60000|2021-06-01|              1471|
|  Zoya|  Marketing| 48000|2021-06-01|              1471|
| Karan|         HR| 53000|2021-06-01|              1471|
|Naveen|Engineering| 70000|2021-06-01|              1471|
|Fatima|  Marketing| 45000|2021-06-01|              1471|
| Meena|         HR| 48000|2021-06-01|              1471|
|   Raj|  Marketing| 51000|2021-06-01|              1471|
+------+-----------+------+----------+------------------+



#### Saving Result

12. save the final merged dataset (all 3 joins) as a partitioned parquet file based
on Department .

In [0]:
full_df.write \
    .partitionBy("Department") \
    .mode("overwrite") \
    .parquet("dbfs:/FileStore/joined_data_parquet")