In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. Ingestion & Exploration

In [7]:
# Read all 3 files (CSV + JSON) using PySpark.
employees_df=spark.read.csv('/content/drive/MyDrive/employees.csv',header=True,inferSchema=True)
attendance_df=spark.read.csv('/content/drive/MyDrive/attendance.csv',header=True,inferSchema=True)
bonuses_df=spark.read.json('/content/drive/MyDrive/bonuses.json',multiLine=True)

In [9]:
# Show schemas and sample records.
employees_df.printSchema()
employees_df.show()
attendance_df.printSchema()
attendance_df.show()
bonuses_df.printSchema()
bonuses_df.show()

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-

In [10]:
# Count distinct departments.
employees_df.select('department').distinct().count()

3

2. DataFrame Operations

In [11]:
# Add a column TenureYears using datediff() and round() .
from pyspark.sql.functions import datediff, current_date, round

employees_df=employees_df.withColumn("TenureYears",round(datediff(current_date(),"JoinDate")/365.0,2))
employees_df.show()


+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



In [13]:
# Calculate TotalCompensation = Salary + Bonus.
from pyspark.sql.functions import coalesce,lit

bonus_df=employees_df.join(bonuses_df,on="EmpID",how="left")
bonus_df=bonus_df.withColumn("TotalCompensation",bonus_df["Salary"]+coalesce(bonus_df["Bonus"],lit(0)))
bonus_df.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [14]:
# Filter employees with more than 2 years in the company.
bonus_df.filter(bonus_df["TenureYears"]>2).show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [16]:
# Show employees who report to a manager ( ManagerID is not null ).
bonus_df.filter(bonus_df["ManagerID"].isNotNull()).select("EmpID","Name","ManagerID").show()

+-----+------+---------+
|EmpID|  Name|ManagerID|
+-----+------+---------+
|    2|   Raj|        1|
|    3|Simran|        1|
|    4| Aamir|        1|
|    5| Nisha|        1|
+-----+------+---------+



3. Aggregation Tasks

In [17]:
# Average salary per department.
employees_df.groupBy("Department").avg("Salary").withColumnRenamed("avg(Salary)", "AvgSalary").show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+



In [18]:
# Number of employees under each manager.
employees_df.groupBy("ManagerID").count().filter("ManagerID IS NOT NULL").show()

+---------+-----+
|ManagerID|count|
+---------+-----+
|        1|    4|
+---------+-----+



In [19]:
# Count of absences per employee.
from pyspark.sql.functions import col
attendance_df.filter(col("Status") == "Absent").groupBy("EmpID").count().show()

+-----+-----+
|EmpID|count|
+-----+-----+
|    4|    2|
|    2|    1|
+-----+-----+



4.Join Tasks

In [20]:
# Join employees and attendance → Get attendance % (Present days / Total days).
from pyspark.sql.functions import count,sum,col,round

attendance_stats = attendance_df.withColumn("PresentFlag",(col("Status")=="Present").cast("int")).groupBy("EmpID")\
      .agg(count("Status").alias("TotalDays"),sum("PresentFlag").alias("PresentDays")) \
    .withColumn("AttendancePercent",round((col("PresentDays")/col("TotalDays"))*100,2))
employees_attendance=employees_df.join(attendance_stats,on="EmpID",how="left")
employees_attendance.select("EmpID","Name","PresentDays","TotalDays","AttendancePercent").show()


+-----+------+-----------+---------+-----------------+
|EmpID|  Name|PresentDays|TotalDays|AttendancePercent|
+-----+------+-----------+---------+-----------------+
|    1| Anita|          2|        2|            100.0|
|    2|   Raj|          1|        2|             50.0|
|    3|Simran|          2|        2|            100.0|
|    4| Aamir|          0|        2|              0.0|
|    5| Nisha|          2|        2|            100.0|
+-----+------+-----------+---------+-----------------+



In [21]:
# Join employees and bonuses → Show top 3 employees by TotalCompensation.
from pyspark.sql.functions import coalesce

bonus_df=employees_df.join(bonuses_df,on="EmpID",how="left").withColumn("Bonus",coalesce(col("Bonus"),lit(0))).withColumn("TotalCompensation",col("Salary")+col("Bonus"))
bonus_df.orderBy(col("TotalCompensation").desc()).select("EmpID", "Name", "TotalCompensation").show()


+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
|    1| Anita|            60000|
|    5| Nisha|            54000|
+-----+------+-----------------+



In [22]:
# Multi-level join: employees + bonuses + attendance .
multijoin_df=employees_df.join(bonuses_df,on="EmpID",how="left").join(attendance_stats,on="EmpID",how="left")
multijoin_df.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+---------+-----------+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalDays|PresentDays|AttendancePercent|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------+-----------+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|        2|          2|            100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|        2|          1|             50.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|        2|          2|            100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|        2|          0|              0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|        2|          2|            100.0|
+-----+------+-----------+----------+------+---------+--

5. String & Date Functions

In [23]:
# Extract year and month from JoinDate .
from pyspark.sql.functions import year,month

employees_df=employees_df.withColumn("JoinYear",year("JoinDate")).withColumn("JoinMonth",month("JoinDate"))
employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|    2021|        5|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|    2020|        3|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|    2019|       11|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+



In [31]:
# Mask employee names using regex.
from pyspark.sql.functions import regexp_replace

employees_df=employees_df.withColumn("MaskedName",regexp_replace("Name","[a-zA-Z]","*"))
employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|    2021|        5|     *****|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|    2020|        3|       ***|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7|    ******|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|    2019|       11|     *****|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1|     *****|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+



In [32]:
# Use substring() to create EmpCode like "EMP001".
from pyspark.sql.functions import lpad,concat,lit
employees_df.withColumn("EmpCode", concat(lit("EMP"),lpad(col("EmpID").cast("string"),3,"0"))).select("EmpID", "EmpCode").show()

+-----+-------+
|EmpID|EmpCode|
+-----+-------+
|    1| EMP001|
|    2| EMP002|
|    3| EMP003|
|    4| EMP004|
|    5| EMP005|
+-----+-------+



6. Conditional & Null Handling


In [33]:
# Use when/otherwise to label performance:
# “High” if Bonus > 6000
# “Medium” if 4000–6000
# “Low” otherwise
from pyspark.sql.functions import when
performance_df=bonus_df.withColumn("Performance",when(col("Bonus")>6000,"High").when((col("Bonus")>=4000) & (col("Bonus")<=6000),"Medium").otherwise("Low"))
performance_df.show()


+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|Performance|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|     Medium|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|       High|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|       High|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|     Medium|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|     Medium|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+-----------+



In [39]:
# Handle missing ManagerID using fillna("No Manager") .
employees_df_fill=employees_df.withColumn("ManagerID",when(col("ManagerID").isNull(),"No Manager").otherwise(col("ManagerID").cast("string")))
employees_df_fill.show()

+-----+------+-----------+----------+------+----------+-----------+--------+---------+----------+
|EmpID|  Name| Department|  JoinDate|Salary| ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|
+-----+------+-----------+----------+------+----------+-----------+--------+---------+----------+
|    1| Anita|         HR|2021-05-01| 55000|No Manager|       4.11|    2021|        5|     *****|
|    2|   Raj|Engineering|2020-03-15| 80000|         1|       5.24|    2020|        3|       ***|
|    3|Simran|Engineering|2022-07-10| 75000|         1|       2.92|    2022|        7|    ******|
|    4| Aamir|  Marketing|2019-11-20| 60000|         1|       5.56|    2019|       11|     *****|
|    5| Nisha|         HR|2023-01-05| 50000|         1|       2.43|    2023|        1|     *****|
+-----+------+-----------+----------+------+----------+-----------+--------+---------+----------+



7. Spark SQL


In [41]:
# Create and use database hr.
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.sql("USE hr")

DataFrame[]

In [43]:
# Save all DataFrames as tables: employees , attendance , bonuses .
employees_df.write.mode("overwrite").saveAsTable("hr.employees")
attendance_df.write.mode("overwrite").saveAsTable("hr.attendance")
bonuses_df.write.mode("overwrite").saveAsTable("hr.bonuses")

In [44]:
# Write SQL queries:
# Top paid employee in each department.
spark.sql("""SELECT * FROM hr.employees WHERE (Department, Salary)
          IN (SELECT Department, MAX(Salary) FROM hr.employees
          GROUP BY Department)""").show()

+-----+-----+-----------+----------+------+---------+-----------+--------+---------+----------+
|EmpID| Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|
+-----+-----+-----------+----------+------+---------+-----------+--------+---------+----------+
|    1|Anita|         HR|2021-05-01| 55000|     NULL|       4.11|    2021|        5|     *****|
|    2|  Raj|Engineering|2020-03-15| 80000|        1|       5.24|    2020|        3|       ***|
|    4|Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|    2019|       11|     *****|
+-----+-----+-----------+----------+------+---------+-----------+--------+---------+----------+



In [48]:
# Write SQL queries:
# Attendance rate by department.
spark.sql("""SELECT e.Department,ROUND(COUNT(CASE WHEN a.Status='Present' THEN 1 END)*1.0/COUNT(*)) AS AttendanceRate
          FROM hr.employees e
          JOIN hr.attendance a ON e.EmpID = a.EmpID
          GROUP BY e.Department
          """).show()

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|             1|
|         HR|             1|
|  Marketing|             0|
+-----------+--------------+



In [50]:
# Write SQL queries:
# Employees joined after 2021 with salary > 70,000.
spark.sql("""SELECT * FROM hr.employees
          WHERE YEAR(JoinDate) > 2021 AND Salary > 70000""").show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7|    ******|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+



8. Advanced (Optional)


In [51]:
# Use a UDF to classify department as "Tech" vs "Non-Tech".
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify_dept(dept):
    if dept == "Engineering":
      return "Tech"
    else:
      return "Non-Tech"
dept_udf = udf(classify_dept, StringType())
classified_df = employees_df.withColumn("DeptType", dept_udf(col("Department")))
classified_df.show()


+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+--------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|DeptType|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+--------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|    2021|        5|     *****|Non-Tech|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|    2020|        3|       ***|    Tech|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7|    ******|    Tech|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|    2019|       11|     *****|Non-Tech|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1|     *****|Non-Tech|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+--------+



In [53]:
# Create a view emp_attendance_summary .
attendance_stats.createOrReplaceTempView("emp_attendance_summary")
spark.sql("SELECT * FROM emp_attendance_summary").show()

+-----+---------+-----------+-----------------+
|EmpID|TotalDays|PresentDays|AttendancePercent|
+-----+---------+-----------+-----------------+
|    1|        2|          2|            100.0|
|    3|        2|          2|            100.0|
|    5|        2|          2|            100.0|
|    4|        2|          0|              0.0|
|    2|        2|          1|             50.0|
+-----+---------+-----------+-----------------+



In [56]:
# Save it as Parquet partitioned by Department .
summary_df=attendance_stats.join(employees_df.select("EmpID", "Department"),on="EmpID",how="left")
summary_df.write.mode("overwrite").partitionBy("Department").parquet("/contents/drive/MyDrive/emp_attendance_summary_parquet")
parquet_df=spark.read.parquet("/contents/drive/MyDrive/emp_attendance_summary_parquet")
parquet_df.printSchema()
parquet_df.show()


root
 |-- EmpID: integer (nullable = true)
 |-- TotalDays: long (nullable = true)
 |-- PresentDays: long (nullable = true)
 |-- AttendancePercent: double (nullable = true)
 |-- Department: string (nullable = true)

+-----+---------+-----------+-----------------+-----------+
|EmpID|TotalDays|PresentDays|AttendancePercent| Department|
+-----+---------+-----------+-----------------+-----------+
|    1|        2|          2|            100.0|         HR|
|    5|        2|          2|            100.0|         HR|
|    3|        2|          2|            100.0|Engineering|
|    2|        2|          1|             50.0|Engineering|
|    4|        2|          0|              0.0|  Marketing|
+-----+---------+-----------+-----------------+-----------+

