In [0]:
spark

In [0]:
#  Task Set – Intermediate to Advanced PySpark (No DLT)
#  Data Ingestion & Schema Handling
#  1. Load the CSV using inferred schema.
from pyspark.sql.functions import *
# Read the timesheet CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
empdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:/Workspace/Shared/employee17.csv")
)
empdf.show()
empdf.printSchema()
#  2. Load the same file with an explicitly defined schema.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Every column is declared nullable; only WorkHours (integer) and WorkDate
# (date) are non-string.
schema = (
    StructType()
    .add("EmployeeID", StringType(), True)
    .add("Name", StringType(), True)
    .add("Department", StringType(), True)
    .add("Project", StringType(), True)
    .add("WorkHours", IntegerType(), True)
    .add("WorkDate", DateType(), True)
    .add("Location", StringType(), True)
    .add("Mode", StringType(), True)
)

# Same file as above, but the column types come from `schema` instead of
# being inferred.
df_explicit = spark.read.csv(
    "file:/Workspace/Shared/employee17.csv",
    schema=schema,
    header=True,
)
df_explicit.printSchema()
df_explicit.show()

#  3. Add a new column Weekday extracted from WorkDate.
# "EEEE" is the datetime pattern for the full day-of-week name.
empdf = empdf.withColumn("Weekday", date_format("WorkDate", "EEEE"))
empdf.show()
#  Aggregations & Grouping
#  4. Total work hours per employee (reused below for the top-2 ranking).
# NOTE: `sum` / `avg` here are the pyspark.sql.functions versions brought in
# by the star import, not the Python builtins.
tothrs = (
    empdf
    .groupBy("EmployeeID")
    .agg(sum("WorkHours").alias("TotalHours"))
)
#  5. Average work hours per department.
(
    empdf
    .groupBy("Department")
    .agg(avg("WorkHours").alias("AverageHours"))
    .show()
)
#  6. Get the top 2 employees by total hours using a window function.
from pyspark.sql.window import Window

# The window has no partitionBy, so it ranks the whole of `tothrs` (one row
# per employee). EmployeeID is a secondary sort key so that row_number() gives
# a deterministic result when two employees tie on TotalHours.
window_spec = Window.orderBy(
    tothrs["TotalHours"].desc(),
    tothrs["EmployeeID"].asc(),
)
top_2_employees = (
    tothrs
    .withColumn("rank", row_number().over(window_spec))
    .filter("rank <= 2")
)
top_2_employees.show()
#  Date Operations
#  7. Keep only the entries whose WorkDate falls on a weekend.
# dayofweek() numbers the days 1 (Sunday) through 7 (Saturday).
empdf.filter(dayofweek("WorkDate").isin(1, 7)).show()
#  8. Running total of hours per employee using a window.
# Frame: from the employee's first row (ordered by WorkDate) up to and
# including the current row.
window_spec_emp = (
    Window
    .partitionBy("EmployeeID")
    .orderBy("WorkDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_running_total = empdf.withColumn(
    "RunningTotalHours",
    sum("WorkHours").over(window_spec_emp),
)
(
    df_running_total
    .select("EmployeeID", "WorkDate", "WorkHours", "RunningTotalHours")
    .show()
)
#  Joining DataFrames
#  9. Build the department -> department-head lookup
#     (the contents of department_location.csv: Department,DeptHead).
deptloc = spark.createDataFrame(
    data=[("IT", "Anand"), ("HR", "Shruti"), ("Finance", "Kamal")],
    schema=["Department", "DeptHead"],
)
#  10. Join with the timesheet data and list all employees with their DeptHead.
# A left join keeps every timesheet row even when its department has no entry
# in deptloc (DeptHead is then null).
timesheet_with_head = empdf.join(deptloc, "Department", "left")
(
    timesheet_with_head
    .select("EmployeeID", "Name", "Department", "DeptHead", "Project", "WorkHours")
    .show()
)
#  Pivot & Unpivot
#  11. Pivot table: total hours per employee (rows) per project (columns).
pivot_df = (
    empdf
    .groupBy("EmployeeID")
    .pivot("Project")
    .agg(sum("WorkHours"))
)
pivot_df.show()
#  12. Unpivot example: convert mode-specific hour columns into rows.
# Small wide-format sample: one row per employee, one column per work mode.
columns = ["EmployeeID", "RemoteHours", "OnsiteHours"]
data = [("E101", 12, 4), ("E102", 6, 9), ("E103", 10, 0)]

wide_df = spark.createDataFrame(data, schema=columns)
wide_df.show()

# stack(2, ...) emits two (Mode, Hours) rows for every input row.
unpivoted = wide_df.select(
    "EmployeeID",
    expr("stack(2, 'Remote', RemoteHours, 'Onsite', OnsiteHours) as (Mode, Hours)"),
)
unpivoted.show()

#  UDF & Conditional Logic
#  13. Create a UDF to classify work hours:
# def workload_tag(hours): 
# if hours >= 8: return "Full" 
# elif hours >= 4: return "Partial" 
# else: return "Light"
#  14. Add a column WorkloadCategory using this UDF.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def workload_tag(hours):
    """Classify a number of work hours as "Full", "Partial" or "Light".

    Args:
        hours: Hours worked, or None when the value is missing.

    Returns:
        "Full" for 8 or more hours, "Partial" for at least 4 but fewer than 8,
        "Light" for anything lower, and None when ``hours`` is None.
    """
    # A null column value reaches a Python UDF as None; comparing None with a
    # number raises TypeError, so pass the missing value through instead.
    if hours is None:
        return None
    if hours >= 8:
        return "Full"
    if hours >= 4:
        return "Partial"
    return "Light"

# Wrap the classifier as a Spark UDF that produces a string column, then tag
# every timesheet row with its category.
workload_udf = udf(workload_tag, returnType=StringType())
df_with_workload = empdf.withColumn(
    "WorkloadCategory",
    workload_udf(col("WorkHours")),
)
(
    df_with_workload
    .select("EmployeeID", "WorkHours", "WorkloadCategory")
    .show()
)

#  Nulls and Cleanup
#  15. Introduce some nulls in the Mode column.
empdf.show()
# The column is named "Mode " (trailing space) at this point; rename it to
# plain "Mode" so it can be referenced normally.
empdf = empdf.withColumnRenamed("Mode ", "Mode")
# Null out Mode for every E102 row; all other rows keep their value.
null_mode = empdf.withColumn(
    "Mode",
    when(col("EmployeeID") == "E102", lit(None)).otherwise(col("Mode")),
)
null_mode.select("EmployeeID", "Mode").show()
#  16. Fill nulls with "Not Provided".
filled_mode = null_mode.na.fill({"Mode": "Not Provided"})
filled_mode.select("EmployeeID", "Mode").show()
#  17. Drop rows where WorkHours < 4 (i.e. keep rows with WorkHours >= 4).
filtered_hours = filled_mode.where(col("WorkHours") >= 4)
filtered_hours.select("EmployeeID", "WorkHours").show()
#  Advanced Conditions
#  18. Use when-otherwise to flag an employee as a remote worker when more
#      than 80% of their entries are Remote.
from pyspark.sql.functions import col, count, when, expr, trim

# Step 1: count total and remote entries per employee.
# Mode values in the source data carry stray spaces ("Remote ", " Remote"),
# so compare the trimmed value; an exact match on "Remote" would count nothing.
remote_ratio_df = empdf.groupBy("EmployeeID").agg(
    count("*").alias("total_entries"),
    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so only the Remote entries are counted.
    count(when(trim(col("Mode")) == "Remote", True)).alias("remote_entries"),
)

# Step 2: share of remote entries, as a percentage.
remote_ratio_df = remote_ratio_df.withColumn(
    "remote_percent",
    (col("remote_entries") / col("total_entries")) * 100,
)

# Step 3: flag employees whose remote share is strictly above 80%.
remote_ratio_df = remote_ratio_df.withColumn(
    "RemoteWorker",
    when(col("remote_percent") > 80, "Yes").otherwise("No"),
)

remote_ratio_df.select("EmployeeID", "remote_percent", "RemoteWorker").show()

#  19. Add a new column ExtraHours: the hours worked beyond 8, or 0 when the
#      entry does not exceed 8 hours.
extra_hours_df = empdf.withColumn(
    "ExtraHours",
    when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(lit(0)),
)
#  Union + Duplicate Handling
#  20. Append a dummy timesheet for new interns using unionByName().
from pyspark.sql import Row
from pyspark.sql.functions import col, to_date

intern_data = [
    Row(EmployeeID="E201", Name="Ria", Department="Intern", Project="Delta", WorkHours=6,
        WorkDate="2024-05-05", Location="Chennai", Mode="Remote"),
    Row(EmployeeID="E202", Name="Arjun", Department="Intern", Project="Delta", WorkHours=5,
        WorkDate="2024-05-05", Location="Kolkata", Mode="Onsite"),
]

# createDataFrame infers WorkHours as bigint and WorkDate as string from the
# Python values. Cast them to the timesheet's types (integer / date) so the
# union does not silently widen those columns for every row.
intern_df = (
    spark.createDataFrame(intern_data)
    .withColumn("WorkHours", col("WorkHours").cast("int"))
    .withColumn("WorkDate", to_date(col("WorkDate")))
)

# allowMissingColumns=True fills columns that intern_df lacks (such as
# Weekday) with nulls instead of failing.
combined_df = empdf.unionByName(intern_df, allowMissingColumns=True)

#  21. Remove duplicate rows based on all columns.
dedup_df = combined_df.dropDuplicates()

dedup_df.show()

# Variation: treat rows that share EmployeeID and WorkDate as duplicates.
# Which row of each such group is kept is not defined.
dedup_by_subset = combined_df.dropDuplicates(["EmployeeID", "WorkDate"])


+----------+-----+----------+-------+---------+----------+---------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode |
+----------+-----+----------+-------+---------+----------+---------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote |
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite |
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote |
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote |
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite |
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai| Remote|
+----------+-----+----------+-------+---------+----------+---------+-------+

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nu