**Intialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("assignment-3")\
      .getOrCreate()
spark

**Data Ingestion & Schema Handling**

In [0]:
#1.Load the CSV using inferred schema.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True)
])
inferred=spark.read.option("header", True).csv("file:/Workspace/Shared/employee_timesheet.csv", inferSchema=True)
inferred.printSchema()
inferred.show()
#2.Load the same file with schema explicitly defined.
emp=spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/employee_timesheet.csv")
emp.printSchema()
emp.show()
#3.Add a new column Weekday extracted from WorkDate.
from pyspark.sql.functions import *
weekday=emp.withColumn("Weekday", date_format(col("WorkDate"), "EEEE"))

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode : string (nullable = true)

+----------+-----+----------+-------+---------+----------+---------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode |
+----------+-----+----------+-------+---------+----------+---------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote |
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote |
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite |
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   M

**Aggregations & Grouping**

In [0]:
#4.Calculate total work hours by employee.
print("total work hours by employee:")
emp.groupBy("EmployeeID", "Name").agg(sum("WorkHours").alias("TotalHours")).show()
#5.Calculate average work hours per department.
print("average work hours per department:")
emp.groupBy("Department").agg(avg("WorkHours").alias("AvgHours")).show()
#6.Get top 2 employees by total hours using window function.
from pyspark.sql.window import Window
total_hours=emp.groupBy("EmployeeID", "Name").agg(sum("WorkHours").alias("TotalHours"))
w=Window.orderBy(desc("TotalHours"))
total_hours.withColumn("Rank", row_number().over(w)).filter("Rank <= 2").show()

total work hours by employee:
+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+

average work hours per department:
+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+

+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



**Date Operations**

In [0]:
#7.Filter entries where WorkDate falls on a weekend.
print("entries where WorkDate falls on a weekend:")
emp.filter(dayofweek("WorkDate").isin([1, 7])).show()
#8.Calculate running total of hours per employee using window.
print("running total of hours per employee:")
windowEmp = Window.partitionBy("EmployeeID").orderBy("WorkDate")
emp.withColumn("RunningTotal",sum("WorkHours").over(windowEmp)).show()

entries where WorkDate falls on a weekend:
+----------+----+----------+-------+---------+----------+--------+------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode|
+----------+----+----------+-------+---------+----------+--------+------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|
+----------+----+----------+-------+---------+----------+--------+------+

running total of hours per employee:
+----------+-----+----------+-------+---------+----------+---------+-------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|   Mode|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+-------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote |          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |           7|
|      E1

**Joining DataFrames**

In [0]:
# 9.Create department_location.csv
dept=spark.createDataFrame([
    ("IT", "Anand"),
    ("HR", "Shruti"),
    ("Finance", "Kamal")
], ["Department", "DeptHead"])
emp.join(dept, on="Department", how="left").select("EmployeeID", "Name", "Department", "DeptHead").show()
#10.Join with timesheet data and list all employees with their DeptHead.
emp.groupBy("EmployeeID").pivot("Project").agg(sum("WorkHours")).show()

+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+

+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



**Pivot & Unpivot**

In [0]:
#11.Pivot table: total hours per employee per project.
#12.Unpivot example: Convert mode-specific hours into rows.
#Clean Mode values before pivoting
cleaned_emp = emp.withColumn("Mode", trim(col("Mode")))
#Pivot the cleaned data
pivoted = cleaned_emp.groupBy("EmployeeID").pivot("Mode").agg(sum("WorkHours"))
#Rename columns to avoid ambiguity
for col_name in pivoted.columns:
    if col_name.strip() != col_name:
        pivoted = pivoted.withColumnRenamed(col_name, col_name.strip())
unpivot_expr = "stack(2, 'Remote', Remote, 'Onsite', Onsite) as (Mode, ModeHours)"
pivoted.select("EmployeeID", expr(unpivot_expr)).filter("ModeHours is not null").show()

+----------+------+---------+
|EmployeeID|  Mode|ModeHours|
+----------+------+---------+
|      E103|Remote|        5|
|      E104|Onsite|        6|
|      E101|Remote|       17|
|      E102|Remote|        8|
|      E102|Onsite|        7|
+----------+------+---------+



**UDF & Conditional Logic**

In [0]:
#13.Create a UDF to classify work hours
def workload_tag(hours):
    if hours >= 8:
        return "Full"
    elif hours >= 4:
        return "Partial"
    else:
        return "Light"
workload_udf = udf(workload_tag, StringType())
#14.Add a column WorkloadCategory using this UDF
emp=emp.withColumn("WorkloadCategory", workload_udf("WorkHours"))
emp.show()

+----------+-----+----------+-------+---------+----------+---------+-------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|   Mode|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+-------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote |         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote |            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite |         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai| Remote|            Full|
+----------+-----+----------+-------+---------+----------+---------+-------+----------------+



**Nulls and Cleanup**

In [0]:
#15. Introduce some nulls in Mode column.
print("introduce some nulls in Mode column:")
nulls = emp.withColumn("Mode", when(col("EmployeeID") == "E104", None).otherwise(col("Mode")))
#16.Fill nulls with "Not Provided".
print("fill nulls with 'Not Provided':")
filled=nulls.fillna({"Mode": "Not Provided"})
filled.show()
#17.Drop rows where WorkHours < 4.
print("drop rows where WorkHours < 4:")
dropped=filled.filter(col("WorkHours") >= 4)
dropped.show()

introduce some nulls in Mode column:
fill nulls with 'Not Provided':
+----------+-----+----------+-------+---------+----------+---------+------------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|     Remote |            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|     Onsite |         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|     Remote |         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|     Remote |            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Not Provided|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote|            Full|
+----------+-----+----------+-------+---

**Advanced Conditions**

In [0]:
#18.Use when-otherwise to mark employees as "Remote Worker" if >80% entries areRemote.
ratio = emp.groupBy("EmployeeID").agg(
    (spark_sum(when(col("Mode") == "Remote", 1).otherwise(0)) / count("*")).alias("RemoteRatio")
)
ratio.withColumn("WorkerType", when(col("RemoteRatio") > 0.8, "Remote Worker").otherwise("Mixed")).show()
#19.Add a new column ExtraHours where hours > 8
emp=emp.withColumn("ExtraHours", when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0))
print("ExtraHours where hours > 8:")
emp.show()

+----------+-----------+----------+
|EmployeeID|RemoteRatio|WorkerType|
+----------+-----------+----------+
|      E103|        0.0|     Mixed|
|      E104|        0.0|     Mixed|
|      E101|        0.0|     Mixed|
|      E102|        0.5|     Mixed|
+----------+-----------+----------+

ExtraHours where hours > 8:
+----------+-----+----------+-------+---------+----------+---------+-------+----------------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|   Mode|WorkloadCategory|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+-------+----------------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |            Full|         0|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |         Partial|         0|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote |         Partial|         0|
|      E101|Anita|        IT|  Alpha|        9|2024-0

**Union + Duplicate Handling**

In [0]:
#20.Append a dummy timesheet for new interns using unionbyname()
from datetime import date
schema = schema.add("WorkloadCategory", StringType(), True)
schema = schema.add("ExtraHours", IntegerType(), True)
intern = spark.createDataFrame([
    ("E200", "Intern1", "IT", "Delta", 5, date(2024, 5, 5), "Bangalore", "Remote", "Light", 0)
], schema=schema)
#21.Remove duplicate rows based on all columns.
union = emp.unionByName(intern)
union.dropDuplicates().show()

+----------+-------+----------+-------+---------+----------+---------+-------+----------------+----------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|   Mode|WorkloadCategory|ExtraHours|
+----------+-------+----------+-------+---------+----------+---------+-------+----------------+----------+
|      E104|  Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite |         Partial|         0|
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote |         Partial|         0|
|      E101|  Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |            Full|         0|
|      E102|    Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |         Partial|         0|
|      E102|    Raj|        HR|   Beta|        8|2024-05-04|   Mumbai| Remote|            Full|         0|
|      E101|  Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote |            Full|         1|
|      E200|Intern1|        IT|  Delt