In [48]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Employee Dataset Exploration").getOrCreate()
spark

In [49]:
#mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Load CSV into Spark DataFrame
df = spark.read.csv("/content/drive/MyDrive/large_employee_dataset.csv", header=True, inferSchema=True)

In [51]:
#first 10 rows
df.show(10)

+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [52]:
#count total number of employees
df.count()

500

In [53]:
#unique departments
df.select("Department").distinct().show()

+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



In [54]:
#Filter all employees in the "IT" department.
df.filter(df.Department == "IT").show()

+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [55]:
#Show employees aged between 30 and 40.
df.filter((df.Age >= 30) & (df.Age <= 40)).show()

+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [56]:
# Sort employees by Salary in descending order.
df.orderBy(df.Salary.desc()).show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

In [57]:
# Get the average salary by department.
df.groupBy("Department").avg("Salary").withColumnRenamed("avg(Salary)", "AverageSalary").show()

+----------+-----------------+
|Department|    AverageSalary|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [58]:
# Count of employees by Status.
df.groupBy("Status").count().show()

+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+



In [59]:
# Highest salary in each city.
from pyspark.sql.functions import max
df.groupBy("City").agg(max("Salary").alias("MaxSalary")).show()

+----------------+---------+
|            City|MaxSalary|
+----------------+---------+
|   Wilsonchester|    67025|
|     Bradshawton|   111116|
|       Steventon|    32009|
|     Lake Alyssa|    84903|
|      North Lisa|    57898|
|    North Marvin|    66252|
|     Jenniferton|    39907|
|     Buckleyview|    50109|
|     Burtonville|    98492|
|    Johnsonmouth|    48799|
|    South Joseph|    52456|
|  Lindseychester|    90340|
|   North Stephen|    91947|
|Port Nicoleshire|    57537|
|    Jerrychester|    53374|
|  North Jennifer|    82486|
|      Laurenstad|    44608|
|West Brendanbury|    90698|
|       Juliaberg|    50170|
|       New James|    54378|
+----------------+---------+
only showing top 20 rows



In [60]:
#Total number of employees who joined each year.
from pyspark.sql.functions import year
df.withColumn("JoinYear", year("JoiningDate")).groupBy("JoinYear").count().show()

+--------+-----+
|JoinYear|count|
+--------+-----+
|    2025|   27|
|    2018|   52|
|    2015|   37|
|    2023|   47|
|    2022|   49|
|    2019|   52|
|    2020|   56|
|    2016|   49|
|    2024|   38|
|    2017|   44|
|    2021|   49|
+--------+-----+



In [61]:
# Department-wise count of employees who are currently "Active".
df.filter(df.Status == "Active").groupBy("Department").count().show()

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



In [63]:
# Average age of employees per department.
df.groupBy("Department").avg("Age").withColumnRenamed("avg(Age)", "Average Age").show()

+----------+------------------+
|Department|       Average Age|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



In [64]:
#Create another dataset with City and Region , and join it.
from pyspark.sql.functions import when, col

df = df.withColumn("Region",
    when(col("City").startswith("North"), "North")
    .when(col("City").startswith("South"), "South")
    .when(col("City").startswith("East"), "East")
    .when(col("City").startswith("West"), "West")
    .otherwise("Unknown")
)
df.select("City", "Region").distinct().show(100, truncate=False)

+---------------------+-------+
|City                 |Region |
+---------------------+-------+
|Jennifermouth        |Unknown|
|South Roger          |South  |
|Caseyville           |Unknown|
|South Luis           |South  |
|Lake Rebeccafort     |Unknown|
|South Jayport        |South  |
|New Jeff             |Unknown|
|Stefanietown         |Unknown|
|West Johnport        |West   |
|Williamsborough      |Unknown|
|Graytown             |Unknown|
|Port Stephaniemouth  |Unknown|
|Patelton             |Unknown|
|Lake Kimberlyside    |Unknown|
|Lake William         |Unknown|
|Larsenside           |Unknown|
|West Monicaport      |West   |
|Alvaradoton          |Unknown|
|Wattsshire           |Unknown|
|Tonyamouth           |Unknown|
|East James           |East   |
|Floydmouth           |Unknown|
|North Christopherstad|North  |
|Melissatown          |Unknown|
|South Joseph         |South  |
|East Jamesport       |East   |
|South Matthewhaven   |South  |
|Port Terrimouth      |Unknown|
|Michell

In [65]:
# Group salaries by Region after the join.
df.groupBy("Region").avg("Salary").show()

+-------+-----------------+
| Region|      avg(Salary)|
+-------+-----------------+
|Unknown|74605.20170454546|
|  South|71426.63829787234|
|   East|       73085.8125|
|   West|73189.75757575757|
|  North|77350.58333333333|
+-------+-----------------+



In [66]:
# Calculate years of experience for each employee (current date - JoiningDate).
from pyspark.sql.functions import current_date, datediff, round
df = df.withColumn("Experience_Years",
    round(datediff(current_date(), col("JoiningDate")) / 365.0,2))
df.select("Name", "JoiningDate", "Experience_Years").show(10, truncate=False)

+---------------------+-----------+----------------+
|Name                 |JoiningDate|Experience_Years|
+---------------------+-----------+----------------+
|Charles Johnson      |2018-07-07 |6.91            |
|Dylan Camacho        |2015-08-25 |9.78            |
|Mr. Ryan Bowman Jr.  |2025-03-11 |0.23            |
|Brian Ball           |2015-10-01 |9.68            |
|Angela Hooper        |2019-08-14 |5.81            |
|Alexander Johnson PhD|2016-04-21 |9.12            |
|Steven Lane          |2021-07-25 |3.86            |
|Riley Johnson        |2015-08-03 |9.84            |
|Emily Washington     |2021-11-30 |3.51            |
|Valerie Fleming      |2019-12-08 |5.49            |
+---------------------+-----------+----------------+
only showing top 10 rows



In [67]:
# Filter employees with more than 5 years of experience
df.filter(col("Experience_Years") > 5).select("Name", "JoiningDate", "Experience_Years").show(truncate=False)

+---------------------+-----------+----------------+
|Name                 |JoiningDate|Experience_Years|
+---------------------+-----------+----------------+
|Charles Johnson      |2018-07-07 |6.91            |
|Dylan Camacho        |2015-08-25 |9.78            |
|Brian Ball           |2015-10-01 |9.68            |
|Angela Hooper        |2019-08-14 |5.81            |
|Alexander Johnson PhD|2016-04-21 |9.12            |
|Riley Johnson        |2015-08-03 |9.84            |
|Valerie Fleming      |2019-12-08 |5.49            |
|Tracy Hughes MD      |2020-06-01 |5.01            |
|Scott Burnett        |2016-04-25 |9.11            |
|Brittany Kerr        |2019-03-24 |6.2             |
|Edwin Burns          |2015-09-14 |9.73            |
|Mary Reynolds        |2018-07-02 |6.93            |
|Erin Berg            |2018-04-27 |7.11            |
|Jason Hines          |2015-11-30 |9.52            |
|Christopher Mcdaniel |2015-06-03 |10.01           |
|April Allen          |2020-05-20 |5.04       