In [0]:
from pyspark.sql.functions import *

# Load the two retail_db JSON datasets used by every cell below.
# `spark` is the session object supplied by the notebook environment.
order_items = spark.read.format('json').load('/public/retail_db_json/order_items')
orders = spark.read.format('json').load('/public/retail_db_json/orders')

In [0]:
# AGGREGATIONS -- overview of the three patterns shown in this notebook.
# NOTE: count / sum / min below are the Spark column functions pulled in by
# the wildcard import from pyspark.sql.functions, not the Python builtins.

# 1. Total aggregation: aggregate functions inside select() summarise the
#    whole DataFrame.
orders.select(
    count('*').alias('count'),
    sum('order_id').alias('sum'),
).show()

# 2. Grouped aggregation using a GroupedData shortcut method.
(
    order_items
    .groupBy('order_item_id')
    .sum('order_item_quantity')
    .show(10)
)

# 3. Grouped aggregation using .agg(): accepts several aggregate expressions
#    in one call, and each of them can be aliased inline.
(
    order_items
    .groupBy('order_item_id')
    .agg(
        sum('order_item_quantity'),
        sum('order_item_subtotal').alias('order_revenue'),
        min('order_item_quantity'),
    )
    .show(5)
)

In [0]:
# Total aggregation: aggregate functions used directly inside select()
# are evaluated over the whole DataFrame rather than per group.
# Here: the number of rows and the sum of the order_id column.

orders.select(
    count('*').alias('count'),
    sum('order_id').alias('sum'),
).show()

In [0]:
# The .count() method can be called directly on a DataFrame.
# Left as the last expression of the cell so the notebook displays its value.

orders.count()

In [0]:
# The .groupBy() method returns a GroupedData object rather than a DataFrame.
# GroupedData is what exposes the aggregate methods (sum, min, agg, ...)
# used in the following cells to aggregate per group.
# The cell displays the type so this can be checked.

type(order_items.groupBy())

In [0]:
# Calling .groupBy() with no columns treats the whole DataFrame as one group.
# Calling .min() with no column names then applies to every column the
# function supports, i.e. the numeric ones; string columns, for example,
# are left out of the result.

(
    order_items
    .groupBy()
    .min()
    .show()
)

In [0]:
# Passing column names to .groupBy() aggregates separately for each distinct
# value of those columns.
# .sum(*cols) then returns, per group, the sum of each of the listed columns.

(
    order_items
    .groupBy('order_item_id')
    .sum('order_item_quantity')
    .show(10)
)

In [0]:
# Limitation of the GroupedData shortcut methods: only ONE aggregate function
# can be applied per groupBy. .sum() already returns a plain DataFrame, and a
# DataFrame has no .min() aggregate method, so the chained call below fails
# with AttributeError.
# The error is caught and printed so the limitation is still demonstrated
# without aborting a top-to-bottom run of the notebook.

try:
    (
        order_items
        .groupBy('order_item_id')
        .sum('order_item_quantity')
        .min('order_item_quantity')
        .show()
    )
except AttributeError as err:
    print('AttributeError:', err)

In [0]:
# Post-processing the output of a shortcut aggregate is awkward as well:
# the auto-generated column names (e.g. 'sum(order_item_quantity)') have to be
# replaced positionally with .toDF(), and rounding needs an extra .withColumn().
#
# The first name passed to .toDF() labels the grouping key, so it must match
# the column used in .groupBy() ('order_item_id'); labelling it as a different
# column would misdescribe the data.
# NOTE(review): if per-order totals were intended, the grouping key itself
# should probably be 'order_item_order_id' -- confirm against the dataset schema.

(
    order_items
    .groupBy('order_item_id')
    .sum('order_item_quantity', 'order_item_subtotal')
    .toDF('order_item_id', 'order_quantity', 'order_revenue')
    .withColumn('order_revenue', round('order_revenue', 2))
    .show(10)
)

In [0]:
# Preferred approach: GroupedData.agg().
# It accepts any number of aggregate expressions in one call, and each
# expression can be renamed inline with .alias().

(
    order_items
    .groupBy('order_item_id')
    .agg(
        sum('order_item_quantity'),
        sum('order_item_subtotal').alias('order_revenue'),
        min('order_item_quantity'),
    )
    .show(5)
)