# Group By and Aggregations

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType
import pyspark.sql.functions as F

# Build (or reuse) the session against the Spark master at the address below,
# limiting this application to 4 cores in total and 4g of memory per executor.
spark = (
    SparkSession.builder
    .master('spark://192.168.2.102:7077')
    .appName('Group By and Aggregations')
    .config("spark.cores.max", "4")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

21/12/22 16:54:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/22 16:54:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Explicit schema for the temperature history file.
temp_hist_schema = (
    StructType()
    .add('Date', DateType())
    .add('Id', IntegerType())
    .add('C', FloatType())
)

# Explicit schema for the beer description file.
beer_schema = (
    StructType()
    .add('Id', IntegerType())
    .add('InitialDate', DateType())
    .add('Type', StringType())
)

# The history file is ';'-separated and no header option is passed for it;
# the description file is read with its first row treated as a header.
temp_hist_df = spark.read.csv(
    'data/beer_temp_hist.txt',
    sep=';',
    schema=temp_hist_schema,
)
beer_description_df = spark.read.csv(
    'data/beer.csv',
    schema=beer_schema,
    header=True,
)

# Inner join on the shared 'Id' column: only Ids present in both frames remain.
beer_df = beer_description_df.join(temp_hist_df, on='Id', how='inner')

beer_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+-----------+--------+----------+----+
| Id|InitialDate|    Type|      Date|   C|
+---+-----------+--------+----------+----+
|  1| 2021-12-01|   Laget|2021-12-01|20.0|
|  1| 2021-12-01|   Laget|2021-12-02|20.2|
|  1| 2021-12-01|   Laget|2021-12-03|null|
|  1| 2021-12-01|   Laget|2021-12-04|20.3|
|  1| 2021-12-01|   Laget|2021-12-05|20.5|
|  2| 2021-12-01|Pale Ale|2021-12-01|16.5|
|  2| 2021-12-01|Pale Ale|2021-12-02|16.4|
|  2| 2021-12-01|Pale Ale|2021-12-03|16.5|
|  2| 2021-12-01|Pale Ale|2021-12-04|null|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.8|
|  2| 2021-12-01|Pale Ale|2021-12-05|16.7|
|  3| 2021-12-01|    null|2021-12-01|18.3|
|  3| 2021-12-01|    null|2021-12-02|18.4|
|  3| 2021-12-01|    null|2021-12-03|null|
|  4| 2021-12-01|     Ipa|2021-12-01|18.2|
+---+-----------+--------+----------+----+



                                                                                

In [3]:
# Group once by beer Id and reuse the grouped handle for each summary below.
beer_df_grouped = beer_df.groupBy('Id')

# Row count per Id.
beer_df_grouped.count().show()

# Minimum, maximum and mean of column 'C' per Id, each shown as its own table.
for summarize in (beer_df_grouped.min, beer_df_grouped.max, beer_df_grouped.mean):
    summarize('C').show()

                                                                                

+---+-----+
| Id|count|
+---+-----+
|  1|    5|
|  3|    3|
|  4|    1|
|  2|    6|
+---+-----+



                                                                                

+---+------+
| Id|min(C)|
+---+------+
|  1|  20.0|
|  3|  18.3|
|  4|  18.2|
|  2|  16.4|
+---+------+



                                                                                

+---+------+
| Id|max(C)|
+---+------+
|  1|  20.5|
|  3|  18.4|
|  4|  18.2|
|  2|  16.8|
+---+------+

+---+------------------+
| Id|            avg(C)|
+---+------------------+
|  1|             20.25|
|  3| 18.34999942779541|
|  4|18.200000762939453|
|  2|16.579999923706055|
+---+------------------+



In [4]:
# Compute several aggregates per Id in a single pass and show them side by side.
summary_columns = [F.count('Id'), F.min('C'), F.max('C')]
beer_df.groupBy('Id').agg(*summary_columns).show()

+---+---------+------+------+
| Id|count(Id)|min(C)|max(C)|
+---+---------+------+------+
|  1|        5|  20.0|  20.5|
|  3|        3|  18.3|  18.4|
|  4|        1|  18.2|  18.2|
|  2|        6|  16.4|  16.8|
+---+---------+------+------+

