Creating Session

In [0]:
from pyspark.sql import SparkSession
# Configure the builder with this notebook's application name, then obtain the
# session via getOrCreate().
session_builder = SparkSession.builder.appName("Employee Attendance Tracker")
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session object as cell output.
spark

Loading data

In [0]:
def _read_csv_with_header(path):
    """Load one CSV file into a DataFrame.

    The reader is configured with ``header=true`` and ``inferSchema=true``,
    the same settings for every input file of this notebook, so they live in
    one place instead of being repeated per file.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by the Spark CSV reader for ``path``.
    """
    return (
        spark.read.format("csv")
        .options(header="true", inferSchema="true")
        .load(path)
    )


# Source tables used by the rest of the notebook.
employee_df = _read_csv_with_header("/Volumes/workspace/default/employee_attendance/employees.csv")

attendance_df = _read_csv_with_header("/Volumes/workspace/default/employee_attendance/attendance.csv")

tasks_df = _read_csv_with_header("/Volumes/workspace/default/employee_attendance/tasks.csv")

Displaying sample data

In [0]:
# Preview the first five rows of each source table, each under its own heading.
for heading, frame in (
    ("\n Employee Record", employee_df),
    ("\n Attendance Record", attendance_df),
    ("\n Tasks Record", tasks_df),
):
    print(heading)
    frame.show(5)



 Employee Record
+-----------+-------------+----------+
|employee_id|employee_name|department|
+-----------+-------------+----------+
|          1|       Ashwin|        IT|
|          2|      Aravind|        HR|
|          3|     Akhilesh|   Finance|
|          4|         Neha|        IT|
|          5|        Rahul|        HR|
+-----------+-------------+----------+
only showing top 5 rows

 Attendance Record
+-----------+-------------------+-------------------+
|employee_id|           clock_in|          clock_out|
+-----------+-------------------+-------------------+
|          1|2025-06-01 09:00:00|2025-06-01 17:00:00|
|          2|2025-06-01 09:15:00|2025-06-01 17:15:00|
|          3|2025-06-01 08:45:00|2025-06-01 16:45:00|
|          4|2025-06-01 09:00:00|2025-06-01 17:00:00|
|          5|2025-06-01 09:05:00|2025-06-01 17:05:00|
+-----------+-------------------+-------------------+
only showing top 5 rows

 Tasks Record
+-----------+--------------------+----------+
|employee_id|   

Adding working hours column

In [0]:
from pyspark.sql.functions import unix_timestamp, round

# Seconds between clock-in and clock-out for each attendance row.
seconds_worked = unix_timestamp("clock_out") - unix_timestamp("clock_in")

# Convert to hours (3600 seconds per hour), rounded to two decimals.
# `round` is the pyspark.sql.functions version imported above, not the builtin.
hours_worked = round(seconds_worked / 3600, 2)

attendance_df = attendance_df.withColumn("work_hours", hours_worked)
attendance_df.select("employee_id", "work_hours").show()




+-----------+----------+
|employee_id|work_hours|
+-----------+----------+
|          1|       8.0|
|          2|       8.0|
|          3|       8.0|
|          4|       8.0|
|          5|       8.0|
|          6|       8.0|
|          7|       8.0|
|          8|       8.0|
|          9|       8.0|
|         10|       8.0|
|         11|       8.0|
|         12|       8.0|
|         13|       8.0|
|         14|       8.0|
|         15|       8.0|
|         16|       8.0|
|         17|       8.0|
|         18|       8.0|
|         19|       8.0|
|         20|       8.0|
+-----------+----------+



Joining employee and task

In [0]:
# Attach employee details to every task; the inner join keeps only tasks whose
# employee_id also appears in employee_df.
tasks_with_employee = tasks_df.join(employee_df, "employee_id", "inner")

emp_task_columns = [
    "employee_id",
    "employee_name",
    "department",
    "task_description",
    "task_date",
]
emp_task_df = tasks_with_employee.select(*emp_task_columns)
emp_task_df.show()

+-----------+-------------+----------+--------------------+----------+
|employee_id|employee_name|department|    task_description| task_date|
+-----------+-------------+----------+--------------------+----------+
|          1|       Ashwin|        IT|Prepare project r...|2025-06-01|
|          2|      Aravind|        HR|  Conduct interviews|2025-06-01|
|          3|     Akhilesh|   Finance|Review financial ...|2025-06-01|
|          4|         Neha|        IT|   Develop feature X|2025-06-01|
|          5|        Rahul|        HR|Organize team mee...|2025-06-01|
|          6|        Pooja|   Finance|Update documentation|2025-06-01|
|          7|       Suresh|        IT|Fix bugs in module Y|2025-06-01|
|          8|        Anita|        HR|Plan training ses...|2025-06-01|
|          9|       Vikram|   Finance|      Audit accounts|2025-06-01|
|         10|        Rohit|        IT|  Deploy new release|2025-06-01|
|         11|        Sneha|        HR|Coordinate client...|2025-06-01|
|     

Joining employee and attendance

In [0]:
# Attach employee details to every attendance row; the inner join keeps only
# rows whose employee_id also appears in employee_df.
attendance_with_employee = attendance_df.join(employee_df, "employee_id", "inner")

emp_attendance_columns = [
    "employee_id",
    "employee_name",
    "department",
    "clock_in",
    "clock_out",
    "work_hours",
]
emp_attendance_df = attendance_with_employee.select(*emp_attendance_columns)
emp_attendance_df.show()

+-----------+-------------+----------+-------------------+-------------------+----------+
|employee_id|employee_name|department|           clock_in|          clock_out|work_hours|
+-----------+-------------+----------+-------------------+-------------------+----------+
|          1|       Ashwin|        IT|2025-06-01 09:00:00|2025-06-01 17:00:00|       8.0|
|          2|      Aravind|        HR|2025-06-01 09:15:00|2025-06-01 17:15:00|       8.0|
|          3|     Akhilesh|   Finance|2025-06-01 08:45:00|2025-06-01 16:45:00|       8.0|
|          4|         Neha|        IT|2025-06-01 09:00:00|2025-06-01 17:00:00|       8.0|
|          5|        Rahul|        HR|2025-06-01 09:05:00|2025-06-01 17:05:00|       8.0|
|          6|        Pooja|   Finance|2025-06-01 09:10:00|2025-06-01 17:10:00|       8.0|
|          7|       Suresh|        IT|2025-06-01 09:00:00|2025-06-01 17:00:00|       8.0|
|          8|        Anita|        HR|2025-06-01 09:20:00|2025-06-01 17:20:00|       8.0|
|         

#### Creating department-level metrics: attendance count, average work hours, and number of tasks

Attendance count and average work hours per department

In [0]:
from pyspark.sql.functions import count, avg
# Per-department attendance summary:
#   attendance_count - count of non-null employee_id values in the department
#   avg_work_hours   - mean of work_hours, rounded to two decimals
# `round` here relies on the pyspark.sql.functions import from an earlier cell.
attendance_count_col = count("employee_id").alias("attendance_count")
avg_work_hours_col = round(avg("work_hours"), 2).alias("avg_work_hours")

hours_by_department = emp_attendance_df.groupBy("department")
attendance_metrice = hours_by_department.agg(attendance_count_col, avg_work_hours_col)

attendance_metrice.show()

+----------+----------------+--------------+
|department|attendance_count|avg_work_hours|
+----------+----------------+--------------+
|        HR|               7|           8.0|
|        IT|               7|           8.0|
|   Finance|               6|           8.0|
+----------+----------------+--------------+



Total tasks per department

In [0]:
# Per-department task summary: count of non-null task_description values.
total_tasks_col = count("task_description").alias("total_tasks")
task_metrice = emp_task_df.groupBy("department").agg(total_tasks_col)
task_metrice.show()

+----------+-----------+
|department|total_tasks|
+----------+-----------+
|        HR|          7|
|        IT|          7|
|   Finance|          6|
+----------+-----------+



Joining the two metrics

In [0]:
# Combine both per-department summaries into one table. The outer join keeps a
# department even when it appears in only one of the two inputs.
dept_metrics_df = attendance_metrice.join(task_metrice, "department", "outer")
dept_metrics_df.show()

+----------+----------------+--------------+-----------+
|department|attendance_count|avg_work_hours|total_tasks|
+----------+----------------+--------------+-----------+
|        HR|               7|           8.0|          7|
|        IT|               7|           8.0|          7|
|   Finance|               6|           8.0|          6|
+----------+----------------+--------------+-----------+



#### Saving Files

saving in delta format

In [0]:
# Persist the department metrics as a Delta table, replacing any earlier output
# at the same location.
delta_writer = dept_metrics_df.write.format("delta").mode("overwrite")
delta_writer.save("/Volumes/workspace/default/employee_attendance/employees_delta")

saving in csv format

In [0]:
# Persist the department metrics as CSV, replacing any earlier output at the
# same location.
# The header option writes the column names as the first line of each file.
# Without it the output holds bare rows only, and reading it back with
# header=true (as this notebook does for its inputs) would treat the first
# data row as column names.
(
    dept_metrics_df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/Volumes/workspace/default/employee_attendance/employees_csv.csv")
)

In [0]:
# List the contents of the volume folder to confirm what has been written.
volume_listing = dbutils.fs.ls("/Volumes/workspace/default/employee_attendance/")
display(volume_listing)

path,name,size,modificationTime
dbfs:/Volumes/workspace/default/employee_attendance/attendance.csv,attendance.csv,903,1749796787000
dbfs:/Volumes/workspace/default/employee_attendance/employees.csv,employees.csv,328,1749796787000
dbfs:/Volumes/workspace/default/employee_attendance/employees_csv.csv/,employees_csv.csv/,0,1749798200520
dbfs:/Volumes/workspace/default/employee_attendance/employees_delta.csv/,employees_delta.csv/,0,1749798200520
dbfs:/Volumes/workspace/default/employee_attendance/tasks.csv,tasks.csv,732,1749796787000
