**Initialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
# Build the session used by every later cell; getOrCreate() hands back an
# already-running session when the runtime has one.
spark = (
    SparkSession.builder
    .appName("Combining-data")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

**Create the DataFrames**

In [0]:
# Employee master data: one (Name, Department, Salary) tuple per person.
columns_emp = ["Name", "Department", "Salary"]
employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
# Column types are left for Spark to infer from the tuples.
df_emp = spark.createDataFrame(data=employee_data, schema=columns_emp)
# Yearly performance review: one (Name, Year, Rating) tuple per person.
columns_perf = ["Name", "Year", "Rating"]
performance_data = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
# Column types are left for Spark to infer from the tuples.
df_perf = spark.createDataFrame(data=performance_data, schema=columns_perf)
# Project assignments: one (Name, Project, HoursWorked) tuple per person.
columns_proj = ["Name", "Project", "HoursWorked"]
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90),
]
# Column types are left for Spark to infer from the tuples.
df_proj = spark.createDataFrame(data=project_data, schema=columns_proj)
# Preview each source table, in creation order, before combining them.
for source_frame in (df_emp, df_perf, df_proj):
    source_frame.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



**Join all three DataFrames**

In [0]:
# 1. Combine the three tables on their shared "Name" column.
#    Inner joins keep only the names present in every table.
print("Joined DataFrames:")
emp_with_perf = df_emp.join(df_perf, on="Name", how="inner")
joined = emp_with_perf.join(df_proj, on="Name", how="inner")
joined.show()
# 2. Compute total hours worked per department.
# Spark's aggregate is imported under an alias on purpose: a bare
# `from pyspark.sql.functions import sum` rebinds the Python builtin `sum`
# for every cell that runs afterwards in this kernel.
from pyspark.sql.functions import sum as spark_sum
print("Total hours worked per department:")
joined.groupBy("Department").agg(spark_sum("HoursWorked").alias("TotalHours")).show()
# 3. Average review rating for each project.
from pyspark.sql.functions import avg
print("Average rating per project:")
rating_by_project = joined.groupBy("Project").agg(
    avg("Rating").alias("AvgRating")
)
rating_by_project.show()

Joined DataFrames:
+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+

Total hours worked per department:
+-----------+----------+
| Department|TotalHours|
+-----------+----------+
|         HR|       250|
|Engineering|       600|
|  Marketing|       190|
+-----------+----------+

Average ra

**Handling Missing Data**

In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# 4. Append one employee whose rating is missing.
# The schema is spelled out field by field, mirroring the column order of
# `joined`; every field is nullable. Presumably it is explicit because the
# only Rating value supplied below is None, leaving nothing to infer from.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Name", StringType()),
        ("Department", StringType()),
        ("Salary", IntegerType()),
        ("Year", IntegerType()),
        ("Rating", DoubleType()),
        ("Project", StringType()),
        ("HoursWorked", IntegerType()),
    ]
])
# Single-row frame holding the extra employee (Rating is None).
new_data = [("Deepak", "HR", 50000, 2023, None, "HR Portal", 100)]
new_df = spark.createDataFrame(data=new_data, schema=schema)
# unionByName lines columns up by name rather than by position.
df_with_null = joined.unionByName(new_df)
print("New DataFrame with null rating:")
df_with_null.show()
# 5. Filter rows with null values
from pyspark.sql.functions import col
print("Filtered rows with null ratings:")
# Keep only the rows whose Rating is missing.
rating_is_missing = col("Rating").isNull()
df_with_null.filter(rating_is_missing).show()
# 6. Replace null ratings with department average
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, when
print("Replacing null ratings with department average")
# The window groups rows by department, so the average is per department.
window_spec = Window.partitionBy("Department")
dept_avg_rating = avg("Rating").over(window_spec)
# Existing ratings pass through untouched; only missing ones take the average.
rating_or_dept_avg = (
    when(col("Rating").isNotNull(), col("Rating"))
    .otherwise(dept_avg_rating)
)
df_filled = df_with_null.withColumn("Rating", rating_or_dept_avg)
df_filled.show()


New DataFrame with null rating:
+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
|Deepak|         HR| 50000|2023|  NULL|       HR Portal|        100|
+------+-----------+------+----+------+----------------+-----------+

Filtered rows with null ratings:
+------+----------+------+----+------+---------+-----------+
|  Name|Department|Salary|Year|Rating|  Proje

**Built-In Functions and UDF**

In [0]:
# 7. Bucket each rating into a performance label.
# Branches are tested top to bottom, so a rating of 4.7 or more is labelled
# "Excellent" before the 4.0 threshold is ever checked.
performance_label = (
    when(col("Rating") >= 4.7, "Excellent")
    .when(col("Rating") >= 4.0, "Good")
    .otherwise("Average")
)
df_filled = df_filled.withColumn("PerformanceCategory", performance_label)
print("PerformanceCategory column:")
df_filled.show()
#8.Create UDF to assign bonus
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def bonus(hours):
    """Return the bonus amount for a given number of hours worked.

    Args:
        hours: Hours worked, or None when the value is missing.

    Returns:
        10000 when ``hours`` is strictly greater than 200, 5000 otherwise,
        and None when ``hours`` is None.
    """
    # The column this is applied to is nullable, and `None > 200` raises
    # TypeError in Python 3; pass missing values through as None instead.
    if hours is None:
        return None
    return 10000 if hours > 200 else 5000
# Wrap the plain Python function as a Spark UDF that yields integers.
bonus_udf = udf(bonus, returnType=IntegerType())
# Derive the Bonus column from each row's HoursWorked value.
bonus_col = bonus_udf(col("HoursWorked"))
df_filled = df_filled.withColumn("Bonus", bonus_col)
print("Bonus column:")
df_filled.show()

PerformanceCategory column:
+------+-----------+------+----+------+----------------+-----------+-------------------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|PerformanceCategory|
+------+-----------+------+----+------+----------------+-----------+-------------------+
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|               Good|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|          Excellent|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|          Excellent|
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|               Good|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|               Good|
|Deepak|         HR| 50000|2023|   4.3|       HR Portal|        100|               Good|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|            Average|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|            Av

**Date and Time Functions**

In [0]:
#9.Add JoinDate and MonthsWorked
from pyspark.sql.functions import lit, months_between, current_date, to_date
# Every employee gets the same fixed join date; MonthsWorked is measured
# from that date up to the day the cell is run.
join_date_col = to_date(lit("2021-06-01"))
months_worked_col = months_between(current_date(), col("JoinDate"))
df_filled = (
    df_filled
    .withColumn("JoinDate", join_date_col)
    .withColumn("MonthsWorked", months_worked_col)
)
print("JoinDate and MonthsWorked columns:")
df_filled.show()
#10.Employees who joined before 2022
print("Employees who joined before 2022:")
# Compare the DATE column against a real date literal rather than a bare
# string: how Spark coerces a date-vs-string comparison has differed between
# major versions, so the explicit form keeps this a date comparison.
cutoff_date = to_date(lit("2022-01-01"))
df_filled.filter(col("JoinDate") < cutoff_date).select("Name", "JoinDate").show()

JoinDate and MonthsWorked columns:
+------+-----------+------+----+------+----------------+-----------+-------------------+-----+----------+------------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|PerformanceCategory|Bonus|  JoinDate|MonthsWorked|
+------+-----------+------+----+------+----------------+-----------+-------------------+-----+----------+------------+
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|               Good| 5000|2021-06-01| 48.32258065|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|          Excellent| 5000|2021-06-01| 48.32258065|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|          Excellent|10000|2021-06-01| 48.32258065|
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|               Good| 5000|2021-06-01| 48.32258065|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|               Good| 5000|2021-06-01| 48.32258065|
|Deepak|     

**Unions**

In [0]:
# 11. Append two extra employees to the original employee table.
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000),
]
df_extra = spark.createDataFrame(
    data=extra_employees,
    schema=["Name", "Department", "Salary"],
)
# unionByName matches columns by name rather than by position.
all_employees = df_emp.unionByName(df_extra)
print("All employees:")
all_employees.show()

All employees:
+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



**Saving Results**

In [0]:
# 12. Persist the final dataset as Parquet, partitioned by Department.
#     mode("overwrite") replaces whatever an earlier run left at this path.
(
    df_filled.write
    .mode("overwrite")
    .partitionBy("Department")
    .parquet("/FileStore/final_employee_data")
)

In [0]:
# Read the partitioned Parquet output back in to inspect what was saved.
a = spark.read.format("parquet").load("/FileStore/final_employee_data")
a.show()

+------+------+----+------+----------------+-----------+-------------------+-----+----------+------------+-----------+
|  Name|Salary|Year|Rating|         Project|HoursWorked|PerformanceCategory|Bonus|  JoinDate|MonthsWorked| Department|
+------+------+----+------+----------------+-----------+-------------------+-----+----------+------------+-----------+
|Naveen| 70000|2023|   4.7|     ML Pipeline|        220|          Excellent|10000|2021-06-01| 48.32258065|Engineering|
| Rahul| 65000|2023|   4.9|   Data Platform|        200|          Excellent| 5000|2021-06-01| 48.32258065|Engineering|
| Priya| 60000|2023|   4.3|   Data Platform|        180|               Good| 5000|2021-06-01| 48.32258065|Engineering|
|Fatima| 45000|2023|   3.9|Campaign Tracker|         90|            Average| 5000|2021-06-01| 48.32258065|  Marketing|
|  Zoya| 48000|2023|   3.8|Campaign Tracker|        100|            Average| 5000|2021-06-01| 48.32258065|  Marketing|
|Deepak| 50000|2023|   4.3|       HR Portal|    