In [58]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("AggFunc")
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [6]:
# Load the CSV: the first line supplies the column names and Spark infers the
# column types from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("AggFunc.csv")

In [9]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



### Aggregate functions return a single value per group (or one value for the whole DataFrame when there is no grouping column)

In [8]:
# Preview the rows of the DataFrame (show() prints at most 20 rows by default).
df.show()

+-----+------------+------+
| Name| Departments|salary|
+-----+------------+------+
|Basit|Data Science| 10000|
|Basit|          ML|  5000|
|  Ali|    Big Data|  4000|
|Basit|    Big Data|  4000|
|  Ali|Data Science|  3000|
|Ahmed|Data Science| 20000|
|Ahmed|          ML| 10000|
|Ahmed|    Big Data|  5000|
|Sunny|Data Science| 10000|
|Sunny|    Big Data|  2000|
+-----+------------+------+



In [21]:
# Total salary per department (one output row for each department)
(
    df.groupBy("Departments")
    .agg(F.sum("salary"))
    .show()
)

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|    Big Data|      15000|
|          ML|      15000|
|Data Science|      43000|
+------------+-----------+



In [51]:
# Highest salary within each department
(
    df.groupBy("Departments")
    .agg(F.max("salary"))
    .show()
)

+------------+-----------+
| Departments|max(salary)|
+------------+-----------+
|    Big Data|       5000|
|          ML|      10000|
|Data Science|      20000|
+------------+-----------+



In [105]:
# Register the DataFrame as a temporary SQL view named EMP so it can be queried with spark.sql.
df.createOrReplaceTempView("EMP")
# MAX over struct(salary, Name) yields, for each department, the struct holding
# the highest salary together with the Name taken from that same row.
# NOTE(review): when two rows tie on salary the winner is presumably decided by
# Name, since the struct is compared as a whole — confirm against Spark's
# struct ordering rules.
query="SELECT Departments, MAX(struct(salary,Name)) AS max_struct FROM EMP GROUP BY Departments;"
spark.sql(query).show()

+------------+--------------+
| Departments|    max_struct|
+------------+--------------+
|    Big Data| {5000, Ahmed}|
|Data Science|{20000, Ahmed}|
|          ML|{10000, Ahmed}|
+------------+--------------+



In [106]:
# Top earner per department via Spark SQL: the inner query keeps the maximum
# (salary, Name) struct for each department, and the outer query unpacks that
# struct back into plain columns. Requires the EMP temp view to be registered.
sql_str = (
    "SELECT e.max_struct.Name AS Name, e.Departments, e.max_struct.salary AS highest_salary"
    " FROM ( SELECT Departments, MAX(struct(salary, Name)) AS max_struct FROM EMP GROUP BY Departments) AS e;"
)
spark.sql(sql_str).show()

+-----+------------+--------------+
| Name| Departments|highest_salary|
+-----+------------+--------------+
|Ahmed|    Big Data|          5000|
|Ahmed|Data Science|         20000|
|Ahmed|          ML|         10000|
+-----+------------+--------------+



In [104]:
# Top earner per department using the DataFrame API: aggregate the maximum
# (salary, Name) struct per department, then unpack the struct fields into
# ordinary columns.
top_struct = F.max(F.struct("salary", "Name")).alias("max")
result = (
    df.groupBy("Departments")
    .agg(top_struct)
    .selectExpr("max.Name", "Departments", "max.salary as highest_salary")
)

result.show()

+-----+------------+--------------+
| Name| Departments|highest_salary|
+-----+------------+--------------+
|Ahmed|    Big Data|          5000|
|Ahmed|Data Science|         20000|
|Ahmed|          ML|         10000|
+-----+------------+--------------+



In [22]:
# Number of rows (employee entries) in each department
(
    df.groupBy("Departments")
    .agg(F.count("*").alias("count"))
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|    Big Data|    4|
|          ML|    2|
|Data Science|    4|
+------------+-----+



In [28]:
# Overall maximum salary across every row (no grouping column)
df.agg(F.max("salary")).show()

+-----------+
|max(salary)|
+-----------+
|      20000|
+-----------+



In [32]:
# Highest salary earned by each person across all of their departments
(
    df.groupBy("Name")
    .agg(F.max("salary"))
    .show()
)

+-----+-----------+
| Name|max(salary)|
+-----+-----------+
|Basit|      10000|
|Sunny|      10000|
|Ahmed|      20000|
|  Ali|       4000|
+-----+-----------+

