In [0]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise create one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Data Ingestion & Schema Handling

In [0]:
# Read the timesheet CSV, treating the first row as the header and letting
# Spark infer each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:/Workspace/Users/azuser3548_mml.local@techademy.com/employee_timesheet.csv")
)
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+



In [0]:
# Load the same file with schema explicitly defined.
from pyspark.sql.types import *
# Explicit schema, built field by field in the same order as the CSV header.
# Every field keeps the default nullable=True.
schema = (
    StructType()
    .add("EmployeeID", StringType())
    .add("Name", StringType())
    .add("Department", StringType())
    .add("Project", StringType())
    .add("WorkHours", IntegerType())
    .add("WorkDate", DateType())
    .add("Location", StringType())
    .add("Mode", StringType())
)

# Same file as the inferred-schema load, but with the types fixed up front.
df_manual = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv("file:/Workspace/Users/azuser3548_mml.local@techademy.com/employee_timesheet.csv")
)
df_manual.show()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+



In [0]:
# Add a Weekday column holding the day name derived from WorkDate
# (pattern "EEEE" renders the full name, e.g. "Wednesday").
from pyspark.sql.functions import date_format

df = df.withColumn(
    "Weekday",
    date_format("WorkDate", "EEEE"),
)
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+



Aggregations & Grouping

In [0]:
# Total work hours per employee (Name rides along as a second grouping key).
# NOTE: this import shadows Python's built-in sum() for the cells that follow;
# later cells rely on `sum` being the Spark aggregate.
from pyspark.sql.functions import sum

(
    df.groupBy("EmployeeID", "Name")
    .agg(sum("WorkHours").alias("TotalHours"))
    .show()
)

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+



In [0]:
# Mean of WorkHours across all timesheet entries in each department.
from pyspark.sql.functions import avg

(
    df.groupBy("Department")
    .agg(avg("WorkHours").alias("AvgHours"))
    .show()
)

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+



In [0]:
# Top 2 employees by total hours, ranked with a window function.
from pyspark.sql.functions import desc, rank
from pyspark.sql.window import Window

# Per-employee totals; `sum` is the Spark aggregate imported in an earlier cell.
top_hours = (
    df.groupBy("EmployeeID", "Name")
    .agg(sum("WorkHours").alias("TotalHours"))
)

# No partitionBy: the ranking is global, highest total first.
window_spec = Window.orderBy(desc("TotalHours"))

# rank() gives tied totals the same rank, so ties at the cutoff are all kept.
(
    top_hours
    .withColumn("Rank", rank().over(window_spec))
    .filter("Rank <= 2")
    .show()
)

+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



Date Operations

In [0]:
# Keep only entries whose WorkDate is a weekend day.
from pyspark.sql.functions import dayofweek

# dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
df.where(dayofweek("WorkDate").isin(1, 7)).show()

+----------+----+----------+-------+---------+----------+--------+------+--------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode| Weekday|
+----------+----+----------+-------+---------+----------+--------+------+--------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|Saturday|
+----------+----+----------+-------+---------+----------+--------+------+--------+



In [0]:
# Running total of hours for each employee, accumulated in WorkDate order.
# `Window` and the Spark `sum` come from imports in earlier cells.
window_spec2 = (
    Window
    .partitionBy("EmployeeID")
    .orderBy("WorkDate")
)
(
    df.withColumn("RunningTotal", sum("WorkHours").over(window_spec2))
    .show()
)

+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|           7|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|          15|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|           5|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|           6|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+



Joining DataFrames

In [0]:
# Lookup table of department heads. department_location.csv is expected to contain:
#   Department,DeptHead
#   IT,Anand
#   HR,Shruti
#   Finance,Kamal
dept_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:/Workspace/Users/azuser3548_mml.local@techademy.com/department_location.csv")
)

+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



In [0]:
# Attach each timesheet entry's department head.
# Left join: every timesheet row is kept even if its department has no match.
df_joined = df.join(dept_df, on="Department", how="left")
(
    df_joined
    .select("EmployeeID", "Name", "Department", "DeptHead")
    .show()
)

+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



Pivot & Unpivot

In [0]:
# Pivot: one row per employee, one column per project, cells hold summed hours.
# Employee/project pairs with no entries show as NULL.
(
    df.groupBy("EmployeeID")
    .pivot("Project")
    .agg(sum("WorkHours"))
    .show()
)

+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



In [0]:
# Unpivot example: emit (Mode, Hours) pairs as rows via the SQL stack() generator.
from pyspark.sql.functions import expr

# stack(1, ...) yields a single output row per input row.
unpivot = "stack(1, Mode, WorkHours) as (Mode, Hours)"
(
    df.select("EmployeeID", "Name", expr(unpivot))
    .show()
)

+----------+-----+------+-----+
|EmployeeID| Name|  Mode|Hours|
+----------+-----+------+-----+
|      E101|Anita|Remote|    8|
|      E102|  Raj|Onsite|    7|
|      E103| John|Remote|    5|
|      E101|Anita|Remote|    9|
|      E104|Meena|Onsite|    6|
|      E102|  Raj|Remote|    8|
+----------+-----+------+-----+



UDF & Conditional Logic

In [0]:
# Create a UDF to classify work hours:
# def workload_tag(hours):
# if hours >= 8: return "Full"
# elif hours >= 4: return "Partial"
# else: return "Light"
# Add a column WorkloadCategory using this UDF.

from pyspark.sql.functions import udf
def workload_tag(hours):
    """Classify a number of worked hours into a workload category.

    Args:
        hours: Hours worked for one timesheet entry, or None when the
            value is missing (Spark passes SQL NULL to Python UDFs as None).

    Returns:
        "Full" for 8 or more hours, "Partial" for 4 up to (not including) 8,
        "Light" for anything below 4, and None when ``hours`` is None.
    """
    # Comparing None with an int raises TypeError, which would fail the whole
    # Spark job on the first null WorkHours; propagate the null instead.
    if hours is None:
        return None
    if hours >= 8:
        return "Full"
    if hours >= 4:
        return "Partial"
    return "Light"
# Wrap the Python classifier as a Spark UDF returning strings, then use it
# to derive WorkloadCategory from WorkHours.
workload_udf = udf(workload_tag, returnType=StringType())
df = df.withColumn(
    "WorkloadCategory",
    workload_udf("WorkHours"),
)
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+

Nulls and Cleanup

In [0]:
# Blank out Mode for employee E102 to produce nulls for the cleanup steps.
from pyspark.sql.functions import when, col

df_null = df.withColumn(
    "Mode",
    # E102 rows get NULL; every other row keeps its existing Mode.
    when(col("EmployeeID") == "E102", None).otherwise(col("Mode")),
)
df_null.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|  NULL|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|  NULL| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+

In [0]:
# Replace null Mode values with the placeholder "Not Provided".
# Only the Mode column is touched; other columns keep any nulls they have.
df_filled = df_null.fillna("Not Provided", subset=["Mode"])
df_filled.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Not Provided| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+

In [0]:
# Drop entries with fewer than 4 hours by keeping only rows where WorkHours >= 4.
cleaned_df = df_filled.where(
    col("WorkHours") >= 4
)
cleaned_df.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Not Provided| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+

Advanced Conditions

In [0]:
# Mark an employee as "Remote Worker" when more than 80% of their entries are Remote.
from pyspark.sql.functions import sum, count, when

remote = df.groupBy("EmployeeID").agg(
    # Numerator: 1 per Remote entry, 0 otherwise (a null Mode also falls to 0).
    # Denominator: count("*") counts every entry. count("Mode") would skip rows
    # with a null Mode, which inflates the ratio and makes the denominator 0
    # for an employee whose Mode values are all null.
    (sum(when(col("Mode") == "Remote", 1).otherwise(0)) / count("*")).alias("remote_ratio")
)
remote_flag = remote.withColumn(
    "WorkerType",
    when(col("remote_ratio") > 0.8, "Remote Worker").otherwise("Onsite/Mixed"),
)
remote_flag.show()

+----------+------------+-------------+
|EmployeeID|remote_ratio|   WorkerType|
+----------+------------+-------------+
|      E103|         1.0|Remote Worker|
|      E104|         0.0| Onsite/Mixed|
|      E101|         1.0|Remote Worker|
|      E102|         0.5| Onsite/Mixed|
+----------+------------+-------------+



In [0]:
# Add ExtraHours: the hours worked beyond 8 in an entry, or 0 when the entry
# does not exceed 8 hours.
df = df.withColumn(
    "ExtraHours",
    when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0),
)
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|         0|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|         0|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|         0|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|         1|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|         0|
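|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|         0|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+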

Union + Duplicate Handling

In [0]:
# Append a dummy timesheet row for a new intern using unionByName().
# createDataFrame() infers column types from the Python literals below
# (for example, the date arrives as a plain string), so each column is cast
# to the type df already uses. That way the union does not have to reconcile
# mismatched types and combined_df keeps df's schema.
dummy_df = spark.createDataFrame([
    ("E999", "Intern", "IT", "Onboard", 6, "2024-05-05", "Bangalore", "Remote", "Sunday", "Partial", 0)
], df.columns).select(
    [col(field.name).cast(field.dataType) for field in df.schema.fields]
)
combined_df = df.unionByName(dummy_df)
combined_df.show()

+----------+------+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|EmployeeID|  Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|ExtraHours|
+----------+------+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|      E101| Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|         0|
|      E102|   Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|         0|
|      E103|  John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|         0|
|      E101| Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|         1|
|      E104| Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|         0|
|      E102|   Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Sat

In [0]:
# De-duplicate the combined timesheet. With no column subset given, two rows
# are duplicates only when every column matches.
final_df = (
    combined_df
    .dropDuplicates()
)
final_df.show()

+----------+------+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|EmployeeID|  Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|ExtraHours|
+----------+------+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|      E101| Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|         0|
|      E104| Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|         0|
|      E103|  John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|         0|
|      E101| Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|         1|
|      E102|   Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|         0|
|      E102|   Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wedn