In [2]:
import pyspark
from pyspark.sql import SparkSession

# Grouping and aggregating
We often use **groupBy** operations and **aggregate functions** when pre-processing our data.

In [4]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('GroupBy and aggregate')
    .getOrCreate()
)
# Leave the session as the last expression so the notebook displays it.
spark

In [6]:
# Load the semicolon-delimited CSV: the first row supplies the column names
# and the column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .option('delimiter', ';')
    .csv('data/tut05_test.csv')
)
df.show()

+------+--------+------+-------+
|  Name|Position|Salary|Country|
+------+--------+------+-------+
|Alexey|  Junior|    55| Russia|
| Robbi|  Senior|   155|  Spain|
|Muchai|  Middle|    90|  India|
|   Lia|  Middle|    95|  Italy|
|  Ivan|  Senior|   140| Russia|
| Torry|  Junior|    60|  Spain|
|  John|  Junior|    58|    USA|
+------+--------+------+-------+



In [7]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Country: string (nullable = true)



## GroupBy + aggregate

**GroupBy operations work together with aggregate functions:**
1) apply groupby operation  
2) apply aggregate function

In [9]:
# groupBy on its own returns a GroupedData object, not a DataFrame;
# an aggregate function has to be applied to it to get a result.
df.groupBy('Position')

<pyspark.sql.group.GroupedData at 0x15f424a55b0>

In [10]:
# Applying an aggregate to the grouped data yields a DataFrame; its repr
# shows only the schema, so call .show() to see the rows.
df.groupBy('Position').max()

DataFrame[Position: string, max(Salary): int]

When called without arguments, the aggregate function is applied only to the numeric columns (here, the integer `Salary` column).

In [11]:
# Maximum of the `Salary` column for each position.
# max() is called without arguments; `Salary` is the only numeric column,
# so it is the only one aggregated.
df.groupBy('Position').max().show()

+--------+-----------+
|Position|max(Salary)|
+--------+-----------+
|  Senior|        155|
|  Middle|         95|
|  Junior|         60|
+--------+-----------+



In [12]:
# Sum of the `Salary` column for each position.
# sum() is called without arguments; `Salary` is the only numeric column,
# so it is the only one aggregated.
df.groupBy('Position').sum().show()

+--------+-----------+
|Position|sum(Salary)|
+--------+-----------+
|  Senior|        295|
|  Middle|        185|
|  Junior|        173|
+--------+-----------+



In [15]:
# Number of rows (entries) for each country.
df.groupBy('Country').count().show()

+-------+-----+
|Country|count|
+-------+-----+
| Russia|    2|
|  India|    1|
|  Italy|    1|
|  Spain|    2|
|    USA|    1|
+-------+-----+



## Only aggregate

In [19]:
# Without groupBy, agg() aggregates over the whole DataFrame.
# The dict maps a column name to the name of the aggregate function:
# here, the maximum of the `Salary` column.
df.agg({'Salary':'max'}).show()

+-----------+
|max(Salary)|
+-----------+
|        155|
+-----------+



In [25]:
# Max of the `Name` column: for a string column, max compares values as
# strings, so the result is the name that sorts last ('Torry').
df.agg({'Name':'max'}).show()

+---------+
|max(Name)|
+---------+
|    Torry|
+---------+

