Importing Packages and Creating a new spark session

In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, round, max, date_format, to_date, to_timestamp, col, count, coalesce, lit
from google.colab import drive

Creating Session

In [None]:
spark = SparkSession.builder \
                    .appName("Employee Attendance Tracker") \
                    .getOrCreate()
spark

Reading CSV File from Google Drive

In [36]:
drive.mount('/content/drive')
attendance_df = spark.read.csv('/content/drive/MyDrive/CapstoneProjectData/Employee/attendance.csv', header=True, inferSchema=True)
employee_df = spark.read.csv('/content/drive/MyDrive/CapstoneProjectData/Employee/employees.csv', header=True, inferSchema=True)
task_df = spark.read.csv('/content/drive/MyDrive/CapstoneProjectData/Employee/tasks.csv', header=True, inferSchema=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
attendance_df.show()
employee_df.show()
task_df.show()

+-----------+-------------------+-------------------+-------------------+-------------------+
|employee_id|           clock_in|          clock_out|        break_start|          break_end|
+-----------+-------------------+-------------------+-------------------+-------------------+
|          1|2025-06-01 09:00:00|2025-06-01 17:30:00|2025-06-01 13:00:00|2025-06-01 13:30:00|
|          1|2025-06-02 09:15:00|2025-06-02 18:00:00|2025-06-02 13:15:00|2025-06-02 13:45:00|
|          2|2025-06-01 09:20:00|2025-06-01 17:00:00|2025-06-01 13:10:00|2025-06-01 13:40:00|
|          2|2025-06-02 09:00:00|2025-06-02 16:50:00|2025-06-02 12:50:00|2025-06-02 13:20:00|
|          3|2025-06-01 08:50:00|2025-06-01 16:40:00|2025-06-01 12:50:00|2025-06-01 13:15:00|
|          3|2025-06-02 09:10:00|2025-06-02 17:10:00|2025-06-02 13:00:00|2025-06-02 13:35:00|
|          4|2025-06-01 09:00:00|2025-06-01 17:15:00|2025-06-01 13:00:00|2025-06-01 13:25:00|
|          4|2025-06-02 08:45:00|2025-06-02 17:00:00|2025-06

Converting  String to timestamp data type

In [None]:
attendance_df = attendance_df.withColumn("clock_in", col("clock_in").cast("timestamp")) \
                             .withColumn("clock_out", col("clock_out").cast("timestamp"))

Extracting date and time from datetime timestamp

In [None]:
attendance_df = attendance_df.withColumn("login_time", date_format("clock_in", "HH:mm:ss")) \
                             .withColumn("login_date", to_date("clock_in"))

Filter for late logins

In [None]:
late_logins_df = attendance_df.filter(col("login_time") > "09:00:00") \
                              .select("employee_id", "login_time", "login_date")

late_logins_df.show()

+-----------+----------+----------+
|employee_id|login_time|login_date|
+-----------+----------+----------+
|          1|  09:15:00|2025-06-02|
|          2|  09:20:00|2025-06-01|
|          3|  09:10:00|2025-06-02|
|          5|  09:30:00|2025-06-01|
|          5|  09:15:00|2025-06-02|
|          6|  09:10:00|2025-06-01|
|          6|  09:20:00|2025-06-02|
|          7|  09:05:00|2025-06-01|
|          7|  09:10:00|2025-06-02|
|          8|  09:15:00|2025-06-02|
|          9|  09:05:00|2025-06-02|
|         10|  09:20:00|2025-06-02|
+-----------+----------+----------+



Filtering for absences


In [None]:
# Creating new data frame for employees and dates
employees_df = employee_df.select("employee_id").distinct()
attendance_df = attendance_df.withColumn("login_date", to_date("clock_in"))
dates_df = attendance_df.select("login_date").distinct()

# Getting all combinations of employees and dates
employee_date_df = employees_df.crossJoin(dates_df)

# creating new data frame for employee who made attendance
actual_attendance_df = attendance_df.select("employee_id", "login_date").distinct()

# Joining combinations of employees and dates and with actual attendance
absentees_df = employee_date_df.join(
    actual_attendance_df,
    on=["employee_id", "login_date"],
    how="left_anti"
)

# Joining absentees table with employee table
absentees_df = absentees_df.withColumnRenamed("login_date", "absent_date") \
                           .join(employee_df, on="employee_id", how="left") \
                           .select("employee_id", "employee_name", "department", "absent_date")

# Displays the null records -> Absentees
absentees_df.show()

+-----------+-------------+----------+-----------+
|employee_id|employee_name|department|absent_date|
+-----------+-------------+----------+-----------+
|         12|        Karan|   Finance| 2025-06-02|
|         13|        Deepa|        IT| 2025-06-02|
|         16|        Manoj|        IT| 2025-06-02|
|         20|        Vikas|        HR| 2025-06-02|
|         19|       Sunita|        IT| 2025-06-02|
|         15|        Priya|   Finance| 2025-06-02|
|         17|       Kavita|        HR| 2025-06-02|
|         11|        Sneha|        HR| 2025-06-02|
|         14|       Ramesh|        HR| 2025-06-02|
|         18|         Ajay|   Finance| 2025-06-02|
|         12|        Karan|   Finance| 2025-06-01|
|         13|        Deepa|        IT| 2025-06-01|
|         16|        Manoj|        IT| 2025-06-01|
|         20|        Vikas|        HR| 2025-06-01|
|         19|       Sunita|        IT| 2025-06-01|
|         15|        Priya|   Finance| 2025-06-01|
|         17|       Kavita|    

 Group by department to get average work hours and productivity



In [43]:
# Adding working hours column
attendance_df = attendance_df.withColumn(
    "work_hours",
    ((col("clock_out").cast("long") - col("clock_in").cast("long")) - (col("break_end").cast("long") - col("break_start").cast("long"))) / 3600.0
)

# Adding work date column
attendance_df = attendance_df.withColumn("work_date", to_date(col("clock_in")))

# Calculating productivity
productivity_df = task_df.groupBy("employee_id", "task_date").count().withColumnRenamed("count", "tasks_done")

# Joining attendance and productivity table
attendance_productivity_df = attendance_df.join(
    productivity_df,
    (attendance_df.employee_id == productivity_df.employee_id) &
    (attendance_df.work_date == productivity_df.task_date),
     how="left") \
    .select(
    attendance_df.employee_id,
    attendance_df.work_hours,
    attendance_df.work_date,
    coalesce(productivity_df.tasks_done, lit(0)).alias("tasks_done")
    )


# joining attendance and employee table
employee_attendance_df = attendance_productivity_df.join(employee_df, on="employee_id", how="left")

# grouping by department and average work hours and average productivity
result_df = employee_attendance_df.groupBy("department") \
                   .agg(
                       round(avg("work_hours"),2).alias("avg_work_hours"),
                       avg("tasks_done").alias("avg_productivity")
                   )

result_df.show()


+----------+--------------+----------------+
|department|avg_work_hours|avg_productivity|
+----------+--------------+----------------+
|        HR|          7.68|             0.5|
|   Finance|          7.54|             0.5|
|        IT|          7.95|             0.5|
+----------+--------------+----------------+

