**Intialize the Saprk Session**

In [84]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder\
    .appName("Master-Task-Set2")\
    .getOrCreate()
)
spark

**Ingestion & Exploration**

In [85]:
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, DateType, FloatType, StructType, StructField
#Load data
employees = spark.read.option("header", True).option("inferSchema", True).csv("/content/employees.csv")
attendance = spark.read.option("header", True).option("inferSchema", True).csv("/content/attendance.csv")
bonuses = spark.read.option("multiline", True).json("/content/bonuses.json")
#Show schemas and data
employees.printSchema()
attendance.printSchema()
bonuses.printSchema()
#Display the data
employees.show()
attendance.show()
bonuses.show()
#Distinct departments
employees.select("Department").distinct().show()


root
 |--  EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID : string (nullable = true)

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status : string (nullable = true)

root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)

+------+------+-----------+----------+------+----------+
| EmpID|  Name| Department|  JoinDate|Salary|ManagerID |
+------+------+-----------+----------+------+----------+
|     1| Anita|         HR|2021-05-01| 55000|          |
|     2|   Raj|Engineering|2020-03-15| 80000|        1 |
|     3|Simran|Engineering|2022-07-10| 75000|        1 |
|     4| Aamir|  Marketing|2019-11-20| 60000|        1 |
|     5| Nisha|         HR|2023-01-05| 50000|         1|
+------+------+-----------+----------+------+----------+

+-----+----------

**DataFrame Operations**

In [86]:
#1.Add a column TenureYears using datediff() and round()
employees = employees.withColumn("JoinDate", to_date(col("JoinDate")))
employees = employees.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate")) / 365, 2))
employees.show()
#2.Calculate TotalCompensation = Salary + Bonus
# Rename the column to remove the leading space before joining
employees = employees.withColumnRenamed(" EmpID", "EmpID")
emp_bonus = employees.join(bonuses, "EmpID").withColumn("TotalCompensation", col("Salary") + col("Bonus"))
#3.Filter employees with more than 2 years in the company
emp_bonus.filter(col("TenureYears") > 2).show()
#4.Show employees who report to a manager (ManagerID is not null)
# It also appears ManagerID has a trailing space, let's fix that too
employees = employees.withColumnRenamed("ManagerID ", "ManagerID")
employees.filter(col("ManagerID").isNotNull()).show()

+------+------+-----------+----------+------+----------+-----------+
| EmpID|  Name| Department|  JoinDate|Salary|ManagerID |TenureYears|
+------+------+-----------+----------+------+----------+-----------+
|     1| Anita|         HR|2021-05-01| 55000|          |       4.11|
|     2|   Raj|Engineering|2020-03-15| 80000|        1 |       5.24|
|     3|Simran|Engineering|2022-07-10| 75000|        1 |       2.92|
|     4| Aamir|  Marketing|2019-11-20| 60000|        1 |       5.56|
|     5| Nisha|         HR|2023-01-05| 50000|         1|       2.43|
+------+------+-----------+----------+------+----------+-----------+

+-----+------+-----------+----------+------+----------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID |TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+----------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|          |       4.11| 5000|2023|         

**Aggregation**

In [87]:
#1.Average salary per department
employees.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()
#2.Number of employees under each manager
employees.groupBy("ManagerID").count().withColumnRenamed("count", "NumEmployees").show()
#3.Count of absences per employee
attendance = attendance.withColumnRenamed("Status ", "Status")
attendance.filter(col("Status") == "Absent").groupBy("EmpID").count().withColumnRenamed("count", "Absences").show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+

+---------+------------+
|ManagerID|NumEmployees|
+---------+------------+
|       1 |           3|
|        1|           1|
|         |           1|
+---------+------------+

+-----+--------+
|EmpID|Absences|
+-----+--------+
+-----+--------+



**Joins**

In [88]:
#1.Attendance %
attendance_summary = attendance.groupBy("EmpID") \
    .agg(count("*").alias("TotalDays"),
         sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")) \
    .withColumn("AttendancePercent", round(col("PresentDays") / col("TotalDays") * 100, 2))
attendance_summary.show()
#2.Join with bonus for Top 3 TotalCompensation
comp_df = employees.join(bonuses, "EmpID").withColumn("TotalCompensation", col("Salary") + col("Bonus"))
comp_df.orderBy(desc("TotalCompensation")).select("EmpID", "Name", "TotalCompensation").show(3)
#3.Multi-level join
employees.join(bonuses, "EmpID").join(attendance_summary, "EmpID").show()


+-----+---------+-----------+-----------------+
|EmpID|TotalDays|PresentDays|AttendancePercent|
+-----+---------+-----------+-----------------+
|    1|        2|          0|              0.0|
|    3|        2|          0|              0.0|
|    5|        2|          1|             50.0|
|    4|        2|          0|              0.0|
|    2|        2|          0|              0.0|
+-----+---------+-----------+-----------------+

+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
+-----+------+-----------------+
only showing top 3 rows

+-----+------+-----------+----------+------+---------+-----------+-----+----+---------+-----------+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalDays|PresentDays|AttendancePercent|
+-----+------+-----------+----------+------+---------+-----------+-----+---

**String, Date Functions**

In [89]:
#Extract year and month from JoinDate
employees = employees.withColumn("JoinYear", year("JoinDate")).withColumn("JoinMonth", month("JoinDate"))
employees.show()
# Mask employee names using regex
employees = employees.withColumn("MaskedName", regexp_replace("Name", "(.).+", "$1***"))
employees.show()
#Use substring() to create EmpCode like "EMP001".
employees = employees.withColumn("EmpCode", concat(lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))
employees.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|    1| Anita|         HR|2021-05-01| 55000|         |       4.11|    2021|        5|
|    2|   Raj|Engineering|2020-03-15| 80000|       1 |       5.24|    2020|        3|
|    3|Simran|Engineering|2022-07-10| 75000|       1 |       2.92|    2022|        7|
|    4| Aamir|  Marketing|2019-11-20| 60000|       1 |       5.56|    2019|       11|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|
+-----+------+-----------+-----

**Conditional & Null Handling**

In [90]:
#Use when/otherwise to label performance
bonuses = bonuses.withColumn("PerformanceLabel",
    when(col("Bonus") > 6000, "High")
    .when(col("Bonus").between(4000, 6000), "Medium")
    .otherwise("Low"))
# Handle missing ManagerID using fillna("No Manager")
employees = employees.withColumn("JoinDate", to_date("JoinDate"))
# Rename the ManagerID column to remove the trailing space after reloading
employees = employees.withColumnRenamed("ManagerID ", "ManagerID")
# Fix null ManagerID using the correct column name
employees = employees.fillna({"ManagerID": "No Manager"})
employees.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|         |       4.11|    2021|        5|      A***| EMP001|
|    2|   Raj|Engineering|2020-03-15| 80000|       1 |       5.24|    2020|        3|      R***| EMP002|
|    3|Simran|Engineering|2022-07-10| 75000|       1 |       2.92|    2022|        7|      S***| EMP003|
|    4| Aamir|  Marketing|2019-11-20| 60000|       1 |       5.56|    2019|       11|      A***| EMP004|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1|      N***| EMP005|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



**Spark SQL**

In [91]:
#create and use database hr
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.catalog.setCurrentDatabase("hr")
# Save all DataFrames as tables: employees, attendance, bonuses
attendance.write.mode("overwrite").saveAsTable("hr.attendance")
bonuses.write.mode("overwrite").saveAsTable("hr.bonuses")
employees.write.mode("overwrite").saveAsTable("hr.employees")
# Queries
# Top paid employee in each department
spark.sql("""
SELECT e.Department, e.Name, e.Salary
FROM hr.employees e
JOIN (
    SELECT Department, MAX(Salary) AS MaxSal
    FROM hr.employees
    GROUP BY Department
) x
ON e.Department = x.Department AND e.Salary = x.MaxSal
""").show()
# Attendance rate by department
spark.sql("""
SELECT e.Department, ROUND(SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END)/COUNT(*) * 100, 2) AS AttendanceRate
FROM hr.employees e
JOIN hr.attendance a ON e.EmpID = a.EmpID
GROUP BY e.Department
""").show()
#Employees joined after 2021 with salary > 70,000
spark.sql("SELECT * FROM hr.employees WHERE JoinDate > '2021-01-01' AND Salary > 70000").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|           0.0|
|         HR|          25.0|
|  Marketing|           0.0|
+-----------+--------------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    3|Simran|Engineering|2022-07-10| 75000|       1 |       2.92|    2022|        7|      S***| EMP003|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



**Advanced**

In [92]:
# UDF: classify department
def classify_dept(dept):
    return "Tech" if dept in ["Engineering", "IT"] else "Non-Tech"
from pyspark.sql.types import StringType
classify_udf = udf(classify_dept, StringType())
employees = employees.withColumn("DeptType", classify_udf(col("Department")))
# Create view & save
emp_attendance_summary = employees.join(attendance_summary, "EmpID")
emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")
# Save as Parquet partitioned by department
emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet("/content/emp_attendance_summary.parquet")
