In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, round, max
from google.colab import drive # Mount Google Drive

# Build the Spark session used by every later cell (getOrCreate reuses a
# running session if one already exists in this kernel).
spark = (
    SparkSession.builder
    .appName("Colab PySpark Setup Employee data set")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [4]:
# Expose Google Drive inside the Colab filesystem.
drive.mount('/content/drive')

# Load the employee CSV from Drive: the first row is the header and column
# types are inferred from the data.
employee_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/SalesData/large_employee_dataset.csv')
)

Mounted at /content/drive
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|        East Robert|
|   

# Exercises
 Basic Exploration

In [63]:
# 1. Show the top 10 rows of the dataset.
print("\n Displaying Top 10 records")
employee_df.show(n=10)

# 2. Count the total number of employees.
print("\n Displaying Total Number of Employees")
total_employees = employee_df.count()
print(total_employees)

# 3. Display unique departments.
print("\n Displaying Unique Departments")
departments_df = employee_df.select(employee_df["Department"]).distinct()
departments_df.show()


 Displaying Top 10 records
+----------+--------------------+---+----------+------+-----------+--------+------------+-----------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|YearsOfExperience|
+----------+--------------------+---+----------+------+-----------+--------+------------+-----------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|                6|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|                9|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|                0|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|                9|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|                5|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|                9|


Filtering & Sorting

In [28]:
# 4. Filter all employees in the "IT" department.
print("\n Displaying Employees of IT department")
(
    employee_df
    .where(employee_df.Department == "IT")
    .select("EmployeeID", "Name", "JoiningDate", "Status")
    .show()
)

# 5. Show employees aged between 30 and 40.
# Column.between is inclusive on both ends, i.e. 30 <= Age <= 40.
print("\n Displaying Employees aged between 30 and 40")
(
    employee_df
    .where(employee_df.Age.between(30, 40))
    .select("EmployeeID", "Name", "Age", "JoiningDate", "Status")
    .show()
)

# 6. Sort employees by Salary in descending order.
print("\n Displaying Employees sorted by Salary in descending order")
(
    employee_df
    .sort("Salary", ascending=False)
    .select("EmployeeID", "Name", "Salary", "JoiningDate", "Status")
    .show()
)





 Displaying Employees of IT department
+----------+-------------------+-----------+--------+
|EmployeeID|               Name|JoiningDate|  Status|
+----------+-------------------+-----------+--------+
|      6598|        Mary Henson| 2021-08-25|  Active|
|      8518|   Elizabeth Abbott| 2022-11-05|  Active|
|      9506|        Thomas Dunn| 2020-07-12|On Leave|
|      9663|        Glenn Mason| 2020-03-27|On Leave|
|      2106|     Richard Bailey| 2021-06-29|Resigned|
|      8212|      Jacob Jackson| 2020-09-18|On Leave|
|      6354|     Nicole Gilmore| 2018-05-04|  Active|
|      5716|         David Wang| 2016-03-12|Resigned|
|      1181|       Joseph Clark| 2016-05-31|On Leave|
|      8302|      Debra Swanson| 2023-10-25|Resigned|
|      9542|      Jeffrey Frank| 2024-01-28|On Leave|
|      4100|Christopher Jimenez| 2017-03-13|  Active|
|      5780|     Kevin Harrison| 2024-03-28|  Active|
|      5572|    Andrew Harrison| 2021-12-08|  Active|
|      6388|        Melissa Lee| 2016-05-1

Aggregation Tasks

In [38]:
# NOTE: `round`, `avg` and `max` here are the pyspark.sql.functions versions
# imported at the top of the notebook, not the Python builtins.

# 7. Get the average salary by department
print("\n Displaying Average Salary by Department")
avg_salary_expr = round(avg("Salary"), 2).alias("AverageSalary")
(
    employee_df
    .groupBy("Department")
    .agg(avg_salary_expr)
    .select("Department", "AverageSalary")
    .show()
)

# 8. Count of employees by Status.
print("\n Displaying Count of Employees by Status")
status_counts_df = employee_df.groupBy("Status").count()
status_counts_df.show()

# 9. Highest salary in each city.
print("\n Displaying Highest salary in each city. ")
max_salary_expr = max("Salary").alias("Maximum Salary")
employee_df.groupBy("City").agg(max_salary_expr).show()




 Displaying Average Salary by Department
+----------+-------------+
|Department|AverageSalary|
+----------+-------------+
|     Sales|     77488.55|
|        HR|     76091.27|
|   Finance|     72834.76|
| Marketing|     71958.19|
|        IT|     73116.26|
+----------+-------------+


 Displaying Count of Employees by Status
+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+


 Displaying Highest salary in each city. 
+----------------+--------------+
|            City|Maximum Salary|
+----------------+--------------+
|   Wilsonchester|         67025|
|     Bradshawton|        111116|
|       Steventon|         32009|
|     Lake Alyssa|         84903|
|      North Lisa|         57898|
|    North Marvin|         66252|
|     Jenniferton|         39907|
|     Buckleyview|         50109|
|     Burtonville|         98492|
|    Johnsonmouth|         48799|
|    South Joseph|         52456|
|  Lindseychester|         90340|

GroupBy and Analysis

In [51]:
from pyspark.sql.functions import year

# 10. Total number of employees who joined each year.
print("\n Displaying Total number of employees who joined each year.")
joining_year_expr = year("JoiningDate")
(
    employee_df
    .withColumn("JoiningYear", joining_year_expr)
    .groupBy("JoiningYear")
    .count()
    .withColumnRenamed("count", "TotalEmployees")
    .show()
)

# 11. Department-wise count of employees who are currently "Active".
print("\n Displaying Department-wise count of employees who are currently Active")
(
    employee_df
    .where(employee_df.Status == "Active")
    .groupBy("Department")
    .count()
    .withColumnRenamed("count", "ActiveEmployees")
    .show()
)

# 12. Average age of employees per department.
# `round` / `avg` are the pyspark.sql.functions versions imported earlier.
print("\n Displaying Average age of employees per department")
avg_age_expr = round(avg("Age"), 2).alias("Average Age")
employee_df.groupBy("Department").agg(avg_age_expr).show()




 Displaying Total number of employees who joined each year.
+-----------+--------------+
|JoiningYear|TotalEmployees|
+-----------+--------------+
|       2025|            27|
|       2018|            52|
|       2015|            37|
|       2023|            47|
|       2022|            49|
|       2019|            52|
|       2020|            56|
|       2016|            49|
|       2024|            38|
|       2017|            44|
|       2021|            49|
+-----------+--------------+


 Displaying Department-wise count of employees who are currently Active
+----------+---------------+
|Department|ActiveEmployees|
+----------+---------------+
|     Sales|             32|
|        HR|             37|
|   Finance|             45|
| Marketing|             32|
|        IT|             26|
+----------+---------------+


 Displaying Average age of employees per department
+----------+-----------+
|Department|Average Age|
+----------+-----------+
|     Sales|      40.54|
|        HR|   

Joining (Use another DataFrame for mapping)

In [56]:
# 13. Create another dataset with City and Region, and join it.
region_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/SalesData/Regions.csv')
)
print("\n Displaying Region dataset")
region_df.show()

# Left join on City so every employee row is kept, whether or not its City
# appears in the region mapping.
joined_df = employee_df.join(region_df, "City", "left")

print("\nDisplaying the dataset after join:")
employee_with_region_df = joined_df.select(employee_df["*"], region_df["Region"])
employee_with_region_df.show()

# 14. Group salaries by Region after the join.
print("\nDisplaying Salaries grouped by Region:")
region_avg_salary_expr = round(avg("Salary"), 2).alias("AverageSalary")
(
    joined_df
    .groupBy("Region")
    .agg(region_avg_salary_expr)
    .orderBy("Region")
    .show()
)






 Displaying Region dataset
+-------------------+------+
|               City|Region|
+-------------------+------+
|          Allentown|  East|
|        Anthonyfort|  West|
|          Gilesstad| North|
|       Jenniferfurt| South|
|       Lake Amystad|  West|
|         Russohaven| North|
|        East Robert|  East|
|         New Thomas| South|
|        West Ashley|  West|
|        Caseborough| North|
|       West Phillip|  West|
|           Jillport|  East|
|          Port Mark| South|
|     North Brittany| North|
|   South Donnaville| South|
|        Port Tricia|  West|
|North Justinborough| North|
|       Mcgeechester|  East|
|         Tonyamouth|  West|
|     Jimenezborough| South|
+-------------------+------+
only showing top 20 rows


Displaying the dataset after join:
+----------+--------------------+---+----------+------+-----------+--------+-------------------+------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|Region|
+-----

Date Operation

In [61]:
from pyspark.sql.functions import current_date, months_between, col, floor

# 15. Calculate years of experience for each employee (current date - JoiningDate)

# Elapsed months between today and the joining date, kept as a reusable Column
# expression so that both the whole-year figure and the ">5 years" filter
# below are derived from the same value. current_date() is evaluated when the
# cell runs, so results change from run to run.
months_of_service = months_between(current_date(), col("JoiningDate"))

# Whole years completed: elapsed months / 12, floored.
# NOTE(review): this re-binds employee_df, so any cell executed after this one
# sees the extra YearsOfExperience column (including earlier cells if they are
# re-run). Kept as-is because later cells may rely on the column; consider a
# separately named DataFrame if that is not the case.
employee_df = employee_df.withColumn(
    "YearsOfExperience",
    floor(months_of_service / 12)
)

print("\n Displaying years of experience for each employee")
employee_df.select("EmployeeID", "Name", "JoiningDate", "YearsOfExperience").show()

# 16. List all employees with more than 5 years of experience
# Compare the elapsed months (> 60) rather than the floored year count:
# `YearsOfExperience > 5` would only keep employees with 6+ completed years
# and silently drop everyone with between 5 and 6 years of service.
print("\n Displaying employees with more than 5 years of experience")
employee_df.filter(months_of_service > 5 * 12) \
           .select("EmployeeID", "Name", "YearsOfExperience") \
           .show()


 Displaying years of experience for each employee
+----------+--------------------+-----------+-----------------+
|EmployeeID|                Name|JoiningDate|YearsOfExperience|
+----------+--------------------+-----------+-----------------+
|      4128|     Charles Johnson| 2018-07-07|                6|
|      6094|       Dylan Camacho| 2015-08-25|                9|
|      5883| Mr. Ryan Bowman Jr.| 2025-03-11|                0|
|      9146|          Brian Ball| 2015-10-01|                9|
|      1918|       Angela Hooper| 2019-08-14|                5|
|      4600|Alexander Johnson...| 2016-04-21|                9|
|      6253|         Steven Lane| 2021-07-25|                3|
|      8278|       Riley Johnson| 2015-08-03|                9|
|      8520|    Emily Washington| 2021-11-30|                3|
|      1298|     Valerie Fleming| 2019-12-08|                5|
|      5157|     Tracy Hughes MD| 2020-06-01|                5|
|      7403|    Johnathan Harmon| 2021-03-09|        