In [27]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, lit, when, \
desc, asc, cast, like, count, min, max, median, percentile
from pyspark.sql.types import *

# Entry point for the DataFrame API. `SparkSession.builder` is the documented
# way to obtain a builder (rather than instantiating `SparkSession.Builder`
# by hand); getOrCreate() hands back an existing session if one is running.
spark = SparkSession.builder.appName('agg()').getOrCreate()

In [28]:
# Toy employee records, one tuple per person, laid out as
# (name, age, salary, dept).
data = [
    ("Ajay", 23, 3000, "Data"),
    ("Rohit", 27, 2000, "Data"),
    ("Dhananjay", 27, 2000, "Data"),
    ("Hema", 26, 2000, "HR"),
    ("Huedsad", 26, 1233, "PayRoll"),
]

# Only column names are supplied here; no column types are declared.
schema = ["name", "age", "salary", "dept"]

# Build the DataFrame used by the aggregation cells below and preview it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+---------+---+------+-------+
|     name|age|salary|   dept|
+---------+---+------+-------+
|     Ajay| 23|  3000|   Data|
|    Rohit| 27|  2000|   Data|
|Dhananjay| 27|  2000|   Data|
|     Hema| 26|  2000|     HR|
|  Huedsad| 26|  1233|PayRoll|
+---------+---+------+-------+



In [29]:
# Group by department once and reuse the grouped handle for the three
# single-statistic shortcuts: head count, highest salary, lowest salary.
by_dept = df.groupBy('dept')
by_dept.count().show()
by_dept.max('salary').show()
by_dept.min('salary').show()

+-------+-----+
|   dept|count|
+-------+-----+
|   Data|    3|
|     HR|    1|
|PayRoll|    1|
+-------+-----+

+-------+-----------+
|   dept|max(salary)|
+-------+-----------+
|   Data|       3000|
|     HR|       2000|
|PayRoll|       1233|
+-------+-----------+

+-------+-----------+
|   dept|min(salary)|
+-------+-----------+
|   Data|       2000|
|     HR|       2000|
|PayRoll|       1233|
+-------+-----------+



In [31]:
# Per-department salary summary computed with a single agg() call instead of
# one grouped query per statistic.
# Note: `count`, `min` and `max` here are the pyspark.sql.functions versions
# imported at the top of the notebook, not the Python builtins.
salary_stats = [
    count('*').alias('countOfEmp'),
    min('salary').alias('minSal'),
    max('salary').alias('maxSal'),
    median('salary').alias('medianSal'),
    percentile('salary', 0.90).alias('P90Sal'),
]
df.groupBy('dept').agg(*salary_stats).show()

+-------+----------+------+------+---------+------+
|   dept|countOfEmp|minSal|maxSal|medianSal|P90Sal|
+-------+----------+------+------+---------+------+
|   Data|         3|  2000|  3000|   2000.0|2800.0|
|     HR|         1|  2000|  2000|   2000.0|2000.0|
|PayRoll|         1|  1233|  1233|   1233.0|1233.0|
+-------+----------+------+------+---------+------+



In [32]:
# Shut down the Spark session now that the notebook is done with it.
spark.stop()