In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Colab PySpark Setup")
    .getOrCreate()
)

# Keep the session as the cell's final expression so its summary is rendered.
spark

In [8]:
import os

# Location of the employee CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/PysparkDemo/large_employee_dataset.csv'

# Sanity check: report whether the dataset is reachable from this runtime.
dataset_found = os.path.exists(file_path)
print(dataset_found)

True


In [22]:
from google.colab import drive

# Mount Google Drive; force_remount=True remounts even when it is already attached.
drive.mount('/content/drive', force_remount=True)

from pyspark.sql.functions import avg, max, sum, count, year, current_date, datediff, col


# Load the employee CSV: the first row supplies the column names and the
# column types are inferred from the data.
emp_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/PysparkDemo/large_employee_dataset.csv')
)

# Quick preview of the loaded data (show() prints 20 rows by default).
emp_df.show()

# 1. Show the top 10 rows of the dataset.
emp_df.show(10)

# 2. Count the total number of employees.
# count() just returns a number, and a bare expression in the middle of a cell
# is never displayed, so the result has to be printed explicitly.
total_employees = emp_df.count()
print(f"Total employees: {total_employees}")

# 3. Display unique departments.
emp_df.select("Department").distinct().show()

#  4. Filter all employees in the "IT" department.
emp_df.where(col("Department") == "IT").show()

#  5. Show employees aged between 30 and 40.
# between() is inclusive on both ends, i.e. 30 <= Age <= 40.
emp_df.where(col("Age").between(30, 40)).show()

#  6. Sort employees by Salary in descending order.
emp_df.sort(col("Salary").desc()).show()

#  7. Get the average salary by department.
(
    emp_df
    .groupBy("Department")
    .agg(avg(col("Salary")).alias("Average Salary"))
    .show()
)

#  8. Count of employees by Status.
(
    emp_df
    .groupBy("Status")
    .count()
    .show()
)

#  9. Highest salary in each city.
# NOTE: `max` here is the Spark aggregate imported above, not the Python built-in.
(
    emp_df
    .groupBy("City")
    .agg(max(col("Salary")).alias("Max Salary"))
    .show()
)

#  10. Total number of employees who joined each year.
# Group directly on the extracted year, then list the years chronologically.
(
    emp_df
    .groupBy(year(col("JoiningDate")).alias("Year"))
    .count()
    .orderBy("Year")
    .show()
)

#  11. Department-wise count of employees who are currently "Active".
(
    emp_df
    .where(col("Status") == "Active")
    .groupBy("Department")
    .count()
    .show()
)

#  12. Average age of employees per department.
(
    emp_df
    .groupBy("Department")
    .agg(avg(col("Age")).alias("Average Age"))
    .show()
)
#  13. Create another dataset with City and Region, and join it.

# Small in-memory lookup table mapping a city to its region.
region_data = [
    ("New York", "East"),
    ("Los Angeles", "West"),
    ("Chicago", "Midwest"),
]
region_df = spark.createDataFrame(region_data, schema=["City", "Region"])

# A left join keeps every employee row; a city that is missing from the
# lookup table ends up with a null Region.
joined_df = emp_df.join(region_df, "City", "left")
joined_df.show()

#  14. Group salaries by Region after the join.
# NOTE: `sum` here is the Spark aggregate imported above, not the Python built-in.
(
    joined_df
    .groupBy("Region")
    .agg(sum(col("Salary")).alias("Total Salary"))
    .show()
)

#  15. Calculate years of experience for each employee (current date - JoiningDate).
# Imported here so this step is self-contained.
from pyspark.sql.functions import months_between

# Calendar-aware tenure in months. Dividing a day count by 365 ignores leap
# days, so it reports a completed year slightly before the real anniversary.
tenure_months = months_between(current_date(), col("JoiningDate"))

# Whole years of experience (the fractional part is truncated by the int cast).
emp_df = emp_df.withColumn("Experience_Years", (tenure_months / 12).cast("int"))
emp_df.select("EmployeeID", "Name", "JoiningDate", "Experience_Years").show()


#  16. List all employees with more than 5 years of experience
# Filter on the exact tenure (more than 60 months) rather than on the truncated
# Experience_Years column: "Experience_Years > 5" would wrongly exclude
# everyone whose tenure is between 5 and 6 years.
emp_df.filter(tenure_months > 60).select("EmployeeID", "Name", "Experience_Years").show()



Mounted at /content/drive
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|        East Robert|
|   