## Aggregation on Spark Dataframe

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create the SparkSession used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('AggregatingSparkDF')
    .getOrCreate()
)

In [3]:
# No header option is passed to the reader, so column names and types come from this DDL string.
orders_schema = 'order_id INT, order_date STRING, order_customer_id INT, order_status STRING'
orders = spark.read.csv('../data/orders.csv', schema=orders_schema)

In [4]:
# Preview the loaded rows; long values such as order_date are truncated in the printed table.
orders.show()

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

In [5]:
# Verify that the columns carry the types declared in the read schema.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to derive the column types from the data.
order_items = spark.read.csv(
    '../data/order_items.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Preview order_items to see the columns picked up from the header row.
order_items.show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            1|                  1|                  957|                  1|             299.98|                  299.98|
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|            3|                  2|                  502|                  5|              250.0|                    50.0|
|            4|                  2|                  403|                  1|             129.99|                  129.99|
|            5|                  4|                  897|                  2|              49.98|                   24.99|
|            6| 

In [8]:
# Check the column types that inferSchema derived for order_items.
order_items.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- order_item_product_id: integer (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_subtotal: double (nullable = true)
 |-- order_item_product_price: double (nullable = true)



Here are the common aggregate functions that are available as part of `pyspark.sql.functions`

* `count`
* `sum`
* `min`
* `max`
* `avg`

In [9]:
# Inspect the documentation of the count aggregate function.
help(count)

Help on function count in module pyspark.sql.functions:

count(col)
    Aggregate function: returns the number of items in a group.
    
    .. versionadded:: 1.3



In [10]:
# Total number of rows in orders, computed as an aggregate expression (the result is a one-row DataFrame).
orders.select(count("*")).show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



In [11]:
# Re-check the available columns before grouping by order_status.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
# Number of orders in each order_status.
(
    orders
    .groupBy('order_status')
    .agg(count("*"))
    .show()
)

+---------------+--------+
|   order_status|count(1)|
+---------------+--------+
|PENDING_PAYMENT|   15030|
|       COMPLETE|   22899|
|        ON_HOLD|    3798|
| PAYMENT_REVIEW|     729|
|     PROCESSING|    8275|
|         CLOSED|    7556|
|SUSPECTED_FRAUD|    1558|
|        PENDING|    7610|
|       CANCELED|    1428|
+---------------+--------+



* Get revenue using `order_item_subtotal` for a given `order_item_order_id`

In [13]:
# Re-check the order_items columns; order_item_subtotal is the field summed for revenue below.
order_items.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- order_item_product_id: integer (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_subtotal: double (nullable = true)
 |-- order_item_product_price: double (nullable = true)



In [14]:
# NOTE: after the wildcard import, `sum` is pyspark.sql.functions.sum, not Python's built-in sum.
help(sum)

Help on function sum in module pyspark.sql.functions:

sum(col)
    Aggregate function: returns the sum of all values in the expression.
    
    .. versionadded:: 1.3



In [15]:
# Look at the individual items that belong to order 2 before aggregating them.
order_items.filter("order_item_order_id = 2").show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|            3|                  2|                  502|                  5|              250.0|                    50.0|
|            4|                  2|                  403|                  1|             129.99|                  129.99|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+



In [16]:
# Revenue of order 2: add up order_item_subtotal over the items of that order.
(
    order_items
    .filter("order_item_order_id == 2")
    .select(sum("order_item_subtotal").alias("order_revenue"))
    .show()
)

+-------------+
|order_revenue|
+-------------+
|       579.98|
+-------------+



* Get number of items, total quantity as well as revenue for a given order item order id
    * Number of items can be computed using count on `order_item_quantity`.
    * Total quantity can be computed using `sum` on `order_item_quantity`.
    * Total revenue can be computed using `sum` on `order_item_subtotal`.

In [17]:
# Re-check the columns used for the item count, total quantity and revenue.
order_items.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- order_item_product_id: integer (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_subtotal: double (nullable = true)
 |-- order_item_product_price: double (nullable = true)



In [18]:
# Several aggregates can be computed in one select: item count, total quantity and revenue of order 2.
(
    order_items
    .filter("order_item_order_id == 2")
    .select(
        count("order_item_quantity").alias("order_item_count"),
        sum("order_item_quantity").alias("order_quantity"),
        sum("order_item_subtotal").alias("order_revenue"),
    )
    .show()
)

+----------------+--------------+-------------+
|order_item_count|order_quantity|order_revenue|
+----------------+--------------+-------------+
|               3|             7|       579.98|
+----------------+--------------+-------------+



* Get the number of records in order_items

In [19]:
# Another approach: DataFrame.count() is an action, so it is executed immediately
# and the number of rows is returned to the driver.
order_items.count()

172198

In [20]:
# The action returns a plain Python int rather than a DataFrame (note that this runs the count again).
type(order_items.count())

int

In [21]:
# Aggregate-function approach: select(count("*")) only builds a new DataFrame;
# nothing is computed until an action such as show() is called on it.
order_items.select(count("*"))

DataFrame[count(1): bigint]

**NOTE:**

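* There are two ways to count records: 1. Action count 2. DataFrame count
* With the `action count`, you call the `count()` method directly on the DataFrame; it runs immediately and returns an integer.
* With the `dataframe count`, you use `select(count("*"))` to count the records in the DataFrame; this returns a new DataFrame, and execution is triggered only once you call an action method such as `show()`.
* Counting is a wide operation: the partial counts from all partitions have to be combined into a single result. Note that `DataFrame.count()` itself is an action, not a transformation.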

#### GroupBy

In [22]:
# Inspect the documentation of DataFrame.groupBy.
help(order_items.groupBy)

Help on method groupBy in module pyspark.sql.dataframe:

groupBy(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Groups the :class:`DataFrame` using the specified columns,
    so we can run aggregation on them. See :class:`GroupedData`
    for all the available aggregate functions.
    
    :func:`groupby` is an alias for :func:`groupBy`.
    
    :param cols: list of columns to group by.
        Each element should be a column name (string) or an expression (:class:`Column`).
    
    >>> df.groupBy().avg().collect()
    [Row(avg(age)=3.5)]
    >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(df.name).avg().collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(['name', df.age]).count().collect())
    [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]
    
    .. versionadded:: 1.3



In [23]:
# Show just the first two rows as a reminder of the column layout.
order_items.show(2)

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            1|                  1|                  957|                  1|             299.98|                  299.98|
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
only showing top 2 rows



In [24]:
# groupBy() with no columns treats the whole DataFrame as a single group;
# min() with no arguments is applied to every numeric column.
order_items.groupBy().min().show()

+------------------+------------------------+--------------------------+------------------------+------------------------+-----------------------------+
|min(order_item_id)|min(order_item_order_id)|min(order_item_product_id)|min(order_item_quantity)|min(order_item_subtotal)|min(order_item_product_price)|
+------------------+------------------------+--------------------------+------------------------+------------------------+-----------------------------+
|                 1|                       1|                        19|                       1|                    9.99|                         9.99|
+------------------+------------------------+--------------------------+------------------------+------------------------+-----------------------------+



In [25]:
# List the (column name, type) pairs of orders.
orders.dtypes

[('order_id', 'int'),
 ('order_date', 'string'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [26]:
# Only the numeric columns are aggregated; the string columns order_date and order_status
# do not appear in the result.
orders.groupBy().min().show()

+-------------+----------------------+
|min(order_id)|min(order_customer_id)|
+-------------+----------------------+
|            1|                     1|
+-------------+----------------------+



#### Perform Grouped Aggregations using Direct Functions

In [27]:
# Grouping alone does not produce a DataFrame; an aggregation is applied to the grouped object in the cells below.
order_item_grouped = order_items.groupBy('order_item_order_id')

In [29]:
# Confirm what kind of object groupBy returned.
type(order_item_grouped)

pyspark.sql.group.GroupedData

In [30]:
# Number of order items per order (one output row per order_item_order_id).
order_item_grouped.count().show()

+-------------------+-----+
|order_item_order_id|count|
+-------------------+-----+
|                148|    3|
|                463|    4|
|                471|    2|
|                496|    5|
|               1088|    2|
|               1580|    1|
|               1591|    3|
|               1645|    5|
|               2366|    1|
|               2659|    5|
|               2866|    4|
|               3175|    2|
|               3749|    1|
|               3794|    1|
|               3918|    4|
|               3997|    2|
|               4101|    1|
|               4519|    1|
|               4818|    1|
|               4900|    2|
+-------------------+-----+
only showing top 20 rows



In [33]:
# List the (column name, type) pairs of order_items; every column here is numeric.
order_items.dtypes

[('order_item_id', 'int'),
 ('order_item_order_id', 'int'),
 ('order_item_product_id', 'int'),
 ('order_item_quantity', 'int'),
 ('order_item_subtotal', 'double'),
 ('order_item_product_price', 'double')]

In [31]:
# With no arguments, sum() totals every numeric column within each order,
# including the id columns.
order_item_grouped.sum().show()

+-------------------+------------------+------------------------+--------------------------+------------------------+------------------------+-----------------------------+
|order_item_order_id|sum(order_item_id)|sum(order_item_order_id)|sum(order_item_product_id)|sum(order_item_quantity)|sum(order_item_subtotal)|sum(order_item_product_price)|
+-------------------+------------------+------------------------+--------------------------+------------------------+------------------------+-----------------------------+
|                148|              1047|                     444|                      1407|                       8|                  479.99|                       229.99|
|                463|              4522|                    1852|                      1685|                      13|       829.9200000000001|           249.97000000000003|
|                471|              2307|                     942|                      1030|                       2|      169.98000000

In [34]:
# Re-check which orders columns are numeric before summing them.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [35]:
# Sum every numeric column per order_date. order_status is a string column and is left out;
# order_date shows up only because it is the grouping key.
(
    orders
    .groupBy('order_date')
    .sum()
    .show()
)

+--------------------+-------------+----------------------+
|          order_date|sum(order_id)|sum(order_customer_id)|
+--------------------+-------------+----------------------+
|2013-08-13 00:00:...|       916059|                459650|
|2013-10-12 00:00:...|      3370470|               1002372|
|2013-11-15 00:00:...|      3349122|                820656|
|2014-03-19 00:00:...|      5940291|                816861|
|2014-04-26 00:00:...|     11823779|               1546739|
|2013-09-16 00:00:...|      3418732|                795738|
|2013-09-20 00:00:...|      2247287|                851705|
|2013-12-31 00:00:...|      7838750|               1619361|
|2013-09-06 00:00:...|      4111583|               1692297|
|2014-06-15 00:00:...|      6940631|                827674|
|2013-12-24 00:00:...|      4918816|               1089616|
|2014-01-07 00:00:...|      4765725|               1017046|
|2014-06-07 00:00:...|     10238004|               1185855|
|2013-10-14 00:00:...|      3535766|    

In [36]:
# Project down to the grouping key and the two measures before grouping.
order_items_grouped = (
    order_items
    .select('order_item_order_id', 'order_item_quantity', 'order_item_subtotal')
    .groupBy('order_item_order_id')
)

In [37]:
# Inspect the documentation of GroupedData.sum.
help(order_items_grouped.sum)

Help on method sum in module pyspark.sql.group:

sum(*cols) method of pyspark.sql.group.GroupedData instance
    Compute the sum for each numeric columns for each group.
    
    :param cols: list of column names (string). Non-numeric columns are ignored.
    
    >>> df.groupBy().sum('age').collect()
    [Row(sum(age)=7)]
    >>> df3.groupBy().sum('age', 'height').collect()
    [Row(sum(age)=7, sum(height)=165)]
    
    .. versionadded:: 1.3



In [38]:
# A bare sum() still totals the grouping key itself, giving a sum(order_item_order_id) column.
order_items_grouped.sum().show()

+-------------------+------------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_order_id)|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+------------------------+
|                148|                     444|                       8|                  479.99|
|                463|                    1852|                      13|       829.9200000000001|
|                471|                     942|                       2|      169.98000000000002|
|                496|                    2480|                       7|      441.95000000000005|
|               1088|                    2176|                       3|      249.97000000000003|
|               1580|                    1580|                       5|                  299.95|
|               1591|                    4773|                      10|                  439.86|
|               1645|         

In [39]:
# Passing column names restricts the sum to order_item_quantity and order_item_subtotal.
(
    order_items_grouped
    .sum('order_item_quantity', 'order_item_subtotal')
    .show()
)

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+
|                148|                       8|                  479.99|
|                463|                      13|       829.9200000000001|
|                471|                       2|      169.98000000000002|
|                496|                       7|      441.95000000000005|
|               1088|                       3|      249.97000000000003|
|               1580|                       5|                  299.95|
|               1591|                      10|                  439.86|
|               1645|                      14|      1509.7900000000002|
|               2366|                       3|                  299.97|
|               2659|                       8|       724.9100000000001|
|               2866|                       5|                  

In [44]:
# toDF renames all three columns by position; the revenue is then rounded to 2 decimals.
(
    order_items_grouped
    .sum('order_item_quantity', 'order_item_subtotal')
    .toDF('order_item_order_id', 'order_quantity', 'order_revenue')
    .withColumn('order_revenue', round('order_revenue', 2))
    .show()
)

+-------------------+--------------+-------------+
|order_item_order_id|order_quantity|order_revenue|
+-------------------+--------------+-------------+
|                148|             8|       479.99|
|                463|            13|       829.92|
|                471|             2|       169.98|
|                496|             7|       441.95|
|               1088|             3|       249.97|
|               1580|             5|       299.95|
|               1591|            10|       439.86|
|               1645|            14|      1509.79|
|               2366|             3|       299.97|
|               2659|             8|       724.91|
|               2866|             5|       569.96|
|               3175|             4|       209.97|
|               3749|             3|       143.97|
|               3794|             5|       299.95|
|               3918|             9|       829.93|
|               3997|             4|       579.95|
|               4101|          

#### Perform Grouped Aggregations using Agg Functions

If you want to perform several aggregations such as min, sum and max simultaneously, you cannot do it with the direct functions shown above; instead, you have to use `agg()`.

In [45]:
# Inspect the documentation of GroupedData.agg.
help(order_items_grouped.agg)

Help on method agg in module pyspark.sql.group:

agg(*exprs) method of pyspark.sql.group.GroupedData instance
    Compute aggregates and returns the result as a :class:`DataFrame`.
    
    The available aggregate functions can be:
    
    1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`
    
    2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`
    
       .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
           a full shuffle is required. Also, all the data of a group will be loaded into
           memory, so the user should be aware of the potential OOM risk if data is skewed
           and certain groups are too large to fit in memory.
    
       .. seealso:: :func:`pyspark.sql.functions.pandas_udf`
    
    If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
    is the column to perform aggregation on, and the value is the aggregate function.
    
    Alt

In [46]:
# Inspect the result schema of agg(): the integer quantity sums to long, the subtotal stays double.
(
    order_item_grouped
    .agg(sum('order_item_quantity'), sum('order_item_subtotal'))
    .printSchema()
)

root
 |-- order_item_order_id: integer (nullable = true)
 |-- sum(order_item_quantity): long (nullable = true)
 |-- sum(order_item_subtotal): double (nullable = true)



In [47]:
# Same aggregation, now displaying the values; the columns keep their default sum(...) names.
(
    order_item_grouped
    .agg(sum('order_item_quantity'), sum('order_item_subtotal'))
    .show()
)

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+
|                148|                       8|                  479.99|
|                463|                      13|       829.9200000000001|
|                471|                       2|      169.98000000000002|
|                496|                       7|      441.95000000000005|
|               1088|                       3|      249.97000000000003|
|               1580|                       5|                  299.95|
|               1591|                      10|                  439.86|
|               1645|                      14|      1509.7900000000002|
|               2366|                       3|                  299.97|
|               2659|                       8|       724.9100000000001|
|               2866|                       5|                  

In [48]:
# With agg(), we can use alias and round as well
order_item_grouped. \
agg(sum('order_item_quantity').alias('order_quantity'), round(sum('order_item_subtotal'), 2).alias('order_revenue')). \
show()

+-------------------+--------------+-------------+
|order_item_order_id|order_quantity|order_revenue|
+-------------------+--------------+-------------+
|                148|             8|       479.99|
|                463|            13|       829.92|
|                471|             2|       169.98|
|                496|             7|       441.95|
|               1088|             3|       249.97|
|               1580|             5|       299.95|
|               1591|            10|       439.86|
|               1645|            14|      1509.79|
|               2366|             3|       299.97|
|               2659|             8|       724.91|
|               2866|             5|       569.96|
|               3175|             4|       209.97|
|               3749|             3|       143.97|
|               3794|             5|       299.95|
|               3918|             9|       829.93|
|               3997|             4|       579.95|
|               4101|          

In [49]:
# Dict form of agg(): {column name: aggregate name}. The sums are not rounded with this approach.
(
    order_item_grouped
    .agg({'order_item_quantity': 'sum', 'order_item_subtotal': 'sum'})
    .show()
)

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_subtotal)|sum(order_item_quantity)|
+-------------------+------------------------+------------------------+
|                148|                  479.99|                       8|
|                463|       829.9200000000001|                      13|
|                471|      169.98000000000002|                       2|
|                496|      441.95000000000005|                       7|
|               1088|      249.97000000000003|                       3|
|               1580|                  299.95|                       5|
|               1591|                  439.86|                      10|
|               1645|      1509.7900000000002|                      14|
|               2366|                  299.97|                       3|
|               2659|       724.9100000000001|                       8|
|               2866|                  569.96|                  

In [51]:
# Rename the dict-based aggregates by NAME instead of by position.
# The output columns of agg(dict) do not follow the order in which the dict was written
# (the subtotal sum came out before the quantity sum above), so a positional toDF(...)
# could silently attach the wrong label to each column. withColumnRenamed targets each
# generated column explicitly and is independent of that order.
(
    order_item_grouped
    .agg({'order_item_quantity': 'sum', 'order_item_subtotal': 'sum'})
    .withColumnRenamed('sum(order_item_subtotal)', 'order_revenue')
    .withColumnRenamed('sum(order_item_quantity)', 'order_quantity')
    # Round only after renaming so the final column keeps the friendly name.
    .withColumn('order_revenue', round('order_revenue', 2))
    .show()
)

+-------------------+-------------+--------------+
|order_item_order_id|order_revenue|order_quantity|
+-------------------+-------------+--------------+
|                148|       479.99|             8|
|                463|       829.92|            13|
|                471|       169.98|             2|
|                496|       441.95|             7|
|               1088|       249.97|             3|
|               1580|       299.95|             5|
|               1591|       439.86|            10|
|               1645|      1509.79|            14|
|               2366|       299.97|             3|
|               2659|       724.91|             8|
|               2866|       569.96|             5|
|               3175|       209.97|             4|
|               3749|       143.97|             3|
|               3794|       299.95|             5|
|               3918|       829.93|             9|
|               3997|       579.95|             4|
|               4101|       129

In [52]:
# Pitfall: a dict cannot hold the same key twice, so the later 'min' entry replaces
# the 'sum' entry and only min(order_item_quantity) is computed.
order_item_grouped. \
agg({'order_item_quantity': 'sum', 'order_item_quantity': 'min'}). \
show()

+-------------------+------------------------+
|order_item_order_id|min(order_item_quantity)|
+-------------------+------------------------+
|                148|                       1|
|                463|                       1|
|                471|                       1|
|                496|                       1|
|               1088|                       1|
|               1580|                       5|
|               1591|                       1|
|               1645|                       1|
|               2366|                       3|
|               2659|                       1|
|               2866|                       1|
|               3175|                       1|
|               3749|                       3|
|               3794|                       5|
|               3918|                       1|
|               3997|                       1|
|               4101|                       1|
|               4519|                       2|
|            

In [53]:
# Preferred approach: pass one Column expression per aggregate, which allows
# several aggregates on the same column plus rounding and aliases.
(
    order_item_grouped
    .agg(
        sum('order_item_quantity').alias('order_quantity'),
        min('order_item_quantity').alias('min_order_quantity'),
        round(sum('order_item_subtotal'), 2).alias('order_revenue'),
    )
    .show()
)

+-------------------+--------------+------------------+-------------+
|order_item_order_id|order_quantity|min_order_quantity|order_revenue|
+-------------------+--------------+------------------+-------------+
|                148|             8|                 1|       479.99|
|                463|            13|                 1|       829.92|
|                471|             2|                 1|       169.98|
|                496|             7|                 1|       441.95|
|               1088|             3|                 1|       249.97|
|               1580|             5|                 5|       299.95|
|               1591|            10|                 1|       439.86|
|               1645|            14|                 1|      1509.79|
|               2366|             3|                 3|       299.97|
|               2659|             8|                 1|       724.91|
|               2866|             5|                 1|       569.96|
|               3175