In [2]:
from google.colab import files
uploaded = files.upload()

Saving cleaned_attendance.csv to cleaned_attendance.csv
Saving cleaned_employee.csv to cleaned_employee.csv
Saving cleaned_task.csv to cleaned_task.csv


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, hour, to_timestamp

spark = SparkSession.builder.appName("EmployeeAttendance").getOrCreate()

attendance_df = spark.read.csv("cleaned_attendance.csv", header=True, inferSchema=True)
task_df = spark.read.csv("cleaned_task.csv", header=True, inferSchema=True)
employee_df = spark.read.csv("cleaned_employee.csv", header=True, inferSchema=True)
attendance_df.show()
task_df.show()
employee_df.show()

+-------------+-----------+---------------+-------------------+-------------------+
|attendance_id|employee_id|attendance_date|           clock_in|          clock_out|
+-------------+-----------+---------------+-------------------+-------------------+
|         1001|        101|     2023-07-01|2025-08-01 09:00:00|2025-08-01 17:00:00|
|         1002|        102|     2023-07-01|2025-08-01 09:15:00|2025-08-01 17:15:00|
|         1003|        103|     2024-01-15|2025-08-01 08:45:00|2025-08-01 16:45:00|
|         1004|        104|     2024-01-15|2025-08-01 09:10:00|2025-08-01 17:00:00|
|         1005|        105|     2025-03-10|2025-08-01 09:00:00|2025-08-01 18:00:00|
|         1006|        106|     2025-03-10|2025-08-01 09:30:00|2025-08-01 17:30:00|
|         1007|        107|     2023-12-05|2025-08-01 08:50:00|2025-08-01 16:50:00|
|         2001|        101|     2024-06-01|2025-08-01 09:00:00|2025-08-01 17:00:00|
|         2002|        102|     2024-06-02|2025-08-01 09:00:00|2025-08-01 17

In [4]:
from pyspark.sql.functions import hour
late_logins = attendance_df.filter(hour("clock_in") >8)
late_logins.select("employee_id", "attendance_date", "clock_in").show()

+-----------+---------------+-------------------+
|employee_id|attendance_date|           clock_in|
+-----------+---------------+-------------------+
|        101|     2023-07-01|2025-08-01 09:00:00|
|        102|     2023-07-01|2025-08-01 09:15:00|
|        104|     2024-01-15|2025-08-01 09:10:00|
|        105|     2025-03-10|2025-08-01 09:00:00|
|        106|     2025-03-10|2025-08-01 09:30:00|
|        101|     2024-06-01|2025-08-01 09:00:00|
|        102|     2024-06-02|2025-08-01 09:00:00|
|        103|     2024-06-03|2025-08-01 09:00:00|
|        104|     2024-06-04|2025-08-01 09:15:00|
|        105|     2024-06-05|2025-08-01 09:00:00|
+-----------+---------------+-------------------+



In [5]:
absentees = attendance_df.filter((attendance_df["clock_in"].isNull()) | (attendance_df["clock_out"].isNull()))
absentees.select("employee_id", "attendance_date").show()

+-----------+---------------+
|employee_id|attendance_date|
+-----------+---------------+
+-----------+---------------+



In [6]:
from pyspark.sql.functions import (col, when)
attendance_df = attendance_df.withColumn("work_hours", (col("clock_out").cast("long") - col("clock_in").cast("long")) / 3600)

task_df = task_df.withColumn("tasks_completed", when(col("status") == "Completed", 1).otherwise(0))

combined_df = attendance_df.join(task_df, on="employee_id", how="left").join(employee_df, on="employee_id", how="left")

combined_df = combined_df.fillna({"tasks_completed": 0, "work_hours": 0})

In [7]:
from pyspark.sql.functions import when
combined_df = combined_df.withColumn(
    "productivity_score", when(col("work_hours") > 0, col("tasks_completed") / col("work_hours")).otherwise(0)
).withColumn(
    "break_time",when(col("work_hours") > 6, 1).otherwise(0)
)


In [8]:
from pyspark.sql.functions import avg
department_summary = combined_df.groupBy("department").agg(
    avg("work_hours").alias("avg_work_hours"),
    avg("productivity_score").alias("avg_productivity_score")
)

department_summary = department_summary.select(
    col("department"),
    col("avg_work_hours").cast("decimal(5,2)").alias("avg_work_hours"),
    col("avg_productivity_score").cast("decimal(5,2)").alias("avg_productivity_score")
)

department_summary.show()

+----------+--------------+----------------------+
|department|avg_work_hours|avg_productivity_score|
+----------+--------------+----------------------+
|        HR|          8.00|                  0.08|
|   Finance|          8.00|                  0.03|
| Marketing|          7.96|                  0.06|
|        IT|          8.50|                  0.09|
+----------+--------------+----------------------+



In [None]:
from google.colab import files
department_summary_pd = department_summary.toPandas()
department_summary_pd.to_csv("Department_summary.csv", index=False)
files.download("Department_summary.csv")

In [None]:
late_logins_pd = late_logins.select("employee_id", "attendance_date", "clock_in").toPandas()
late_logins_pd.to_csv("Attendance_Issue.csv", index=False)
files.download("Attendance_Issue.csv")