In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                    .appName("Aggregation and Grouping") \
                    .getOrCreate()
spark

## Dataset 1: Employee Data

In [0]:
data = [ 
        ("Ananya", "HR", 52000), 
        ("Rahul", "Engineering", 65000), 
        ("Priya", "Engineering", 60000), 
        ("Zoya", "Marketing", 48000), 
        ("Karan", "HR", 53000), 
        ("Naveen", "Engineering", 70000), 
        ("Fatima", "Marketing", 45000) 
        ] 
columns = ["Name", "Department", "Salary"]
employee_df = spark.createDataFrame(data, columns)

## Dataset 2: Performance Data

In [0]:
performance =[ 
                ("Ananya", 2023, 4.5), 
                ("Rahul", 2023, 4.9), 
                ("Priya", 2023, 4.3), 
                ("Zoya", 2023, 3.8), 
                ("Karan", 2023, 4.1), 
                ("Naveen", 2023, 4.7), 
                ("Fatima", 2023, 3.9) 
            ] 
columns_perf = ["Name", "Year", "Rating"] 
performance_df = spark.createDataFrame(performance, columns_perf)

## PySpark Exercises – Set 2 (Advanced)
### GroupBy and Aggregations

1. Get the average salary by department.

In [0]:
from pyspark.sql.functions import avg
employee_df.groupBy(employee_df.Department) \
           .agg(avg(employee_df.Salary).alias("Average Salary")) \
           .show()

+-----------+--------------+
| Department|Average Salary|
+-----------+--------------+
|         HR|       52500.0|
|Engineering|       65000.0|
|  Marketing|       46500.0|
+-----------+--------------+



2. Count of employees per department.

In [0]:
employee_df.groupBy(employee_df.Department) \
           .count() \
           .withColumnRenamed("count", "Number of Employees") \
           .show()

+-----------+-------------------+
| Department|Number of Employees|
+-----------+-------------------+
|         HR|                  2|
|Engineering|                  3|
|  Marketing|                  2|
+-----------+-------------------+



3. Maximum and minimum salary in Engineering.

In [0]:
from pyspark.sql.functions import min, max
employee_df.groupBy(employee_df.Department) \
           .agg(min(employee_df.Salary).alias("Minimum Salary"), 
                max(employee_df.Salary).alias("Maximum Salary")) \
           .show()

+-----------+--------------+--------------+
| Department|Minimum Salary|Maximum Salary|
+-----------+--------------+--------------+
|         HR|         52000|         53000|
|Engineering|         60000|         70000|
|  Marketing|         45000|         48000|
+-----------+--------------+--------------+



### Join and Combine Data

4.  Perform an inner join between employee_data and performance_data on Name .

In [0]:
employee_performance_df = employee_df.join(performance_df, on="Name", how="inner")
employee_performance_df.show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+



5. Show each employee’s salary and performance rating.

In [0]:
employee_performance_df.select("Name", "Salary", "Rating").show()

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+



6. Filter employees with rating > 4.5 and salary > 60000.

In [0]:
employee_performance_df.filter((employee_performance_df.Rating > 4.5) & (employee_performance_df.Salary > 60000)) \
                       .show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



###  Window & Rank

7. Rank employees by salary department-wise

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

dept_df = Window.partitionBy("Department") \
                .orderBy(desc("Salary"))
employee_df = employee_df.withColumn("Rank", rank().over(dept_df))
employee_df.show()

+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+



8. Calculate cumulative salary in each department.

In [0]:
from pyspark.sql.functions import sum
from pyspark.sql.window import Window

dept_df = Window.partitionBy("Department") \
                    .orderBy("Salary") \
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

cumm_df = employee_df.withColumn("Cumulative_Salary", sum("Salary").over(dept_df))

cumm_df.select("Name", "Department", "Salary", "Cumulative_Salary").show()


+------+-----------+------+-----------------+
|  Name| Department|Salary|Cumulative_Salary|
+------+-----------+------+-----------------+
| Priya|Engineering| 60000|            60000|
| Rahul|Engineering| 65000|           125000|
|Naveen|Engineering| 70000|           195000|
|Ananya|         HR| 52000|            52000|
| Karan|         HR| 53000|           105000|
|Fatima|  Marketing| 45000|            45000|
|  Zoya|  Marketing| 48000|            93000|
+------+-----------+------+-----------------+



###Date Operations

9.Add a new column JoinDate with random dates between 2020 and 2023.

In [0]:
import random
from datetime import datetime, timedelta
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
def random_date():
    start = datetime(2020, 1, 1)
    end = datetime(2023, 12, 31)
    return start + timedelta(days=random.randint(0, (end - start).days))

random_date_udf = udf(random_date, DateType())
employee_df = employee_df.withColumn("JoinDate", random_date_udf())
employee_df.show()

+------+-----------+------+----+----------+
|  Name| Department|Salary|Rank|  JoinDate|
+------+-----------+------+----+----------+
|Naveen|Engineering| 70000|   1|2023-06-14|
| Rahul|Engineering| 65000|   2|2021-01-13|
| Priya|Engineering| 60000|   3|2021-03-01|
| Karan|         HR| 53000|   1|2023-03-30|
|Ananya|         HR| 52000|   2|2020-08-11|
|  Zoya|  Marketing| 48000|   1|2020-01-15|
|Fatima|  Marketing| 45000|   2|2023-03-02|
+------+-----------+------+----+----------+



10. Add column YearsWithCompany using current_date() and datediff() 

In [0]:
from pyspark.sql.functions import current_date, datediff, floor

employee_df = employee_df.withColumn("YearsWithCompany",
                                    floor(datediff(current_date(), employee_df.JoinDate) / 365)
                                    )

employee_df.show()


+------+-----------+------+----+----------+----------------+
|  Name| Department|Salary|Rank|  JoinDate|YearsWithCompany|
+------+-----------+------+----+----------+----------------+
|Naveen|Engineering| 70000|   1|2023-01-14|               2|
| Rahul|Engineering| 65000|   2|2020-04-18|               5|
| Priya|Engineering| 60000|   3|2021-10-20|               3|
| Karan|         HR| 53000|   1|2021-02-17|               4|
|Ananya|         HR| 52000|   2|2023-03-05|               2|
|  Zoya|  Marketing| 48000|   1|2020-03-13|               5|
|Fatima|  Marketing| 45000|   2|2023-06-27|               1|
+------+-----------+------+----+----------+----------------+



### Writing to Files

11. Write the full employee DataFrame to CSV with headers.


In [0]:
employee_df.write.mode("overwrite").option("header", True).csv("dbfs:/FileStore/performance_csv")

12. Save the joined DataFrame to a Parquet file

In [0]:
employee_performance_df.write.mode("overwrite").parquet("dbfs:/FileStore/joined_data_parquet")

Displaying  csv files

In [0]:
%fs ls dbfs:/FileStore/

path,name,size,modificationTime
dbfs:/FileStore/employee_data_csv/,employee_data_csv/,0,1749632743000
dbfs:/FileStore/joined_data_parquet/,joined_data_parquet/,0,1749632828000
dbfs:/FileStore/performance_csv/,performance_csv/,0,1749633311000
