**Initialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session (getOrCreate() reuses a live one if present).
spark = (
    SparkSession.builder
    .appName("Agg-Grouping")
    .getOrCreate()
)
# Bare trailing expression so the notebook renders the session object.
spark

**Create the DataFrames**

In [0]:
# --- Employee data: one (Name, Department, Salary) tuple per employee ---
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
emp = spark.createDataFrame(data, schema=columns)
emp.show()

# --- Performance data: one (Name, Year, Rating) tuple per employee ---
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
perf = spark.createDataFrame(performance, schema=columns_perf)
perf.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



**GroupBy and Aggregations**

In [0]:
# Spark's max/min are imported under aliases (the same way this notebook
# imports `sum as _sum`) so they do not shadow Python's built-in max()/min()
# for every cell executed afterwards.
from pyspark.sql.functions import avg, count, max as _max, min as _min

#1. Average salary by department
print("Average Salary by Department")
avg_salary = emp.groupBy("Department").agg(avg("Salary").alias("Average Salary"))
avg_salary.show()

#2. Count of employees per department
#   (count("Name") counts non-null names only)
print("Count of Employees per Department")
count_employees = emp.groupBy("Department").agg(count("Name").alias("Number of Employees"))
count_employees.show()

#3. Maximum and minimum salary in Engineering
#   No groupBy here: after the filter, agg() collapses the remaining rows
#   into a single summary row.
print("Maximum and Minimum Salary in Engineering")
max_min_salary = emp.filter(emp.Department == "Engineering").agg(
    _max("Salary").alias("Maximum Salary"),
    _min("Salary").alias("Minimum Salary"),
)
max_min_salary.show()

Average Salary by Department
+-----------+--------------+
| Department|Average Salary|
+-----------+--------------+
|         HR|       52500.0|
|Engineering|       65000.0|
|  Marketing|       46500.0|
+-----------+--------------+

Count of Employees per Department
+-----------+-------------------+
| Department|Number of Employees|
+-----------+-------------------+
|         HR|                  2|
|Engineering|                  3|
|  Marketing|                  2|
+-----------+-------------------+

Maximum and Minimum Salary in Engineering
+--------------+--------------+
|Maximum Salary|Minimum Salary|
+--------------+--------------+
|         70000|         60000|
+--------------+--------------+



**Joins and Filtering**

In [0]:
# 4. Inner join of the employee and performance DataFrames on "Name".
print("Inner Join between Employee and Performance Data")
joined = emp.join(perf, "Name", "inner")
joined.show()

# 5. Each employee's salary next to their performance rating.
print("Employee Salary and Performance Rating")
salary_and_rating = joined.select("Name", "Salary", "Rating")
salary_and_rating.show()

# 6. Keep only employees with rating > 4.5 AND salary > 60000.
print("Employees with Rating > 4.5 and Salary > 60000")
has_high_rating = joined["Rating"] > 4.5
has_high_salary = joined["Salary"] > 60000
filtered = joined.where(has_high_rating & has_high_salary)
filtered.show()


Inner Join between Employee and Performance Data
+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+

Employee Salary and Performance Rating
+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+

Employees with Rating > 4.5 and Salary > 60000
+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+

**Window & Rank**

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum

#7. Rank employees by salary within each department.
#   Ordering is ascending, so the lowest salary in a department gets rank 1;
#   rank() gives equal salaries the same rank.
print("Rank Employees by Salary within Each Department")
window = Window.partitionBy("Department").orderBy("Salary")
ranked = emp.withColumn("Rank", rank().over(window))
ranked.show()

#8. Calculate cumulative (running-total) salary in each department.
#   An ordered window without an explicit frame defaults to a RANGE frame,
#   which adds all rows tied on Salary at once, so tied employees would show
#   the same total. The explicit ROWS frame adds one employee at a time, and
#   "Name" is a tie-breaker so the running order is deterministic.
print("Cumulative Salary in Each Department")
running_total_window = (
    Window.partitionBy("Department")
    .orderBy("Salary", "Name")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
cumulative_salary = emp.withColumn(
    "Cumulative Salary", _sum("Salary").over(running_total_window)
)
cumulative_salary.show()

Rank Employees by Salary within Each Department
+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
| Priya|Engineering| 60000|   1|
| Rahul|Engineering| 65000|   2|
|Naveen|Engineering| 70000|   3|
|Ananya|         HR| 52000|   1|
| Karan|         HR| 53000|   2|
|Fatima|  Marketing| 45000|   1|
|  Zoya|  Marketing| 48000|   2|
+------+-----------+------+----+

Cumulative Salary in Each Department
+------+-----------+------+-----------------+
|  Name| Department|Salary|Cumulative Salary|
+------+-----------+------+-----------------+
| Priya|Engineering| 60000|            60000|
| Rahul|Engineering| 65000|           125000|
|Naveen|Engineering| 70000|           195000|
|Ananya|         HR| 52000|            52000|
| Karan|         HR| 53000|           105000|
|Fatima|  Marketing| 45000|            45000|
|  Zoya|  Marketing| 48000|            93000|
+------+-----------+------+-----------------+



**Date Operations**

In [0]:
import random
from datetime import date, timedelta
from pyspark.sql.functions import to_date, col, current_date, datediff
from pyspark.sql import SparkSession
# Obtain the Spark session for this cell via getOrCreate().
spark = (
    SparkSession.builder
    .appName("EmployeeData")
    .getOrCreate()
)

# Employee records as (Name, Department, Salary) tuples.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
emp = spark.createDataFrame(data, schema=columns)
#9.Generate random JoinDate between 2020-2023
def random_date(start=date(2020, 1, 1), end=date(2023, 12, 31), rng=None):
    """Return a random calendar date in [start, end] as a 'YYYY-MM-DD' string.

    Args:
        start: Earliest date that may be returned (inclusive).
        end: Latest date that may be returned (inclusive).
        rng: Optional ``random.Random`` instance to draw from. Pass a seeded
            instance for reproducible output; when omitted, the module-level
            ``random`` generator is used.

    Returns:
        The chosen date formatted via ``str(date)``, i.e. ISO 'YYYY-MM-DD'.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")
    source = random if rng is None else rng
    span_days = (end - start).days
    # randint is inclusive on both ends, so both `start` and `end` are reachable.
    return str(start + timedelta(days=source.randint(0, span_days)))

# Extra Spark functions needed only for the tenure calculation in this block.
from pyspark.sql.functions import floor, months_between

# Seed the generator so the "random" join dates are the same on every run;
# without this, each re-run of the cell produces different join dates.
JOIN_DATE_SEED = 42
random.seed(JOIN_DATE_SEED)

# Create (Name, JoinDate) pairs
join_date_data = [(name, random_date()) for name, _, _ in data]

# Create JoinDate DataFrame: parse the ISO string into a date column, then
# drop the string helper column.
join_dates_df = (
    spark.createDataFrame(join_date_data, ["Name", "JoinDateStr"])
    .withColumn("JoinDate", to_date(col("JoinDateStr"), "yyyy-MM-dd"))
    .drop("JoinDateStr")
)

# Join employee data with join dates
emp_with_join_dates = (
    emp.join(join_dates_df, on="Name", how="inner")
    .select("Name", "Department", "Salary", "JoinDate")
)
display(emp_with_join_dates)

#10. Add YearsWithCompany column
#    Whole completed years of service. months_between/12 follows the calendar,
#    whereas dividing a day count by 365 ignores leap days and can report an
#    extra year shortly before the actual anniversary.
emp_with_dates = emp_with_join_dates.withColumn(
    "YearsWithCompany",
    floor(months_between(current_date(), col("JoinDate")) / 12).cast("int"),
)
display(emp_with_dates)

Name,Department,Salary,JoinDate
Ananya,HR,52000,2020-05-10
Fatima,Marketing,45000,2020-11-09
Karan,HR,53000,2022-02-04
Naveen,Engineering,70000,2021-02-02
Priya,Engineering,60000,2023-08-31
Rahul,Engineering,65000,2020-07-02
Zoya,Marketing,48000,2022-01-28


Name,Department,Salary,JoinDate,YearsWithCompany
Ananya,HR,52000,2020-05-10,5
Fatima,Marketing,45000,2020-11-09,4
Karan,HR,53000,2022-02-04,3
Naveen,Engineering,70000,2021-02-02,4
Priya,Engineering,60000,2023-08-31,1
Rahul,Engineering,65000,2020-07-02,4
Zoya,Marketing,48000,2022-01-28,3


**Save the Data**

In [0]:
# Persist both result sets; "overwrite" mode replaces existing output at each path.
# Employees with join dates -> CSV with a header row.
emp_with_dates.write.mode("overwrite").option("header", True).csv("/FileStore/employee_data")
# Joined employee/performance data -> Parquet.
joined.write.mode("overwrite").parquet("/FileStore/employee_perf")

In [0]:
# Read the saved outputs back and render them to confirm the write succeeded.
saved_employees = spark.read.option("header", True).csv("/FileStore/employee_data")
display(saved_employees)

saved_performance = spark.read.parquet("/FileStore/employee_perf")
display(saved_performance)

Name,Department,Salary,JoinDate,YearsWithCompany
Ananya,HR,52000,2021-09-26,3
Fatima,Marketing,45000,2020-06-01,5
Karan,HR,53000,2020-07-22,4
Naveen,Engineering,70000,2022-02-08,3
Priya,Engineering,60000,2020-06-25,4
Rahul,Engineering,65000,2022-02-24,3
Zoya,Marketing,48000,2020-12-24,4


Name,Department,Salary,Year,Rating
Ananya,HR,52000,2023,4.5
Fatima,Marketing,45000,2023,3.9
Karan,HR,53000,2023,4.1
Naveen,Engineering,70000,2023,4.7
Priya,Engineering,60000,2023,4.3
Rahul,Engineering,65000,2023,4.9
Zoya,Marketing,48000,2023,3.8
