# Aggregation
Three Types
## Simple - avg, count, min, max, sum
## Grouping - same as simple
## Windowing - lead, lag, rank, dense_rank, cume_dist
All aggregations in Spark are implemented using built-in functions.

In [1]:
# findspark locates the local Spark installation and makes pyspark importable,
# which is why init() is called before the pyspark imports below.
import findspark
findspark.init()

import pyspark
from pyspark.sql import *  # brings SparkSession into scope
from pyspark.sql import functions as f  # built-in column and aggregate functions

# Create (or reuse) a Spark session that runs locally with 3 worker threads.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Misc Transformations")
    .getOrCreate()
)

In [2]:
# Load the invoices CSV: the first row is treated as the header and column
# types are inferred from the data.
invoice_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("data/invoices.csv")
)

In [3]:
# Simple aggregations over the whole DataFrame, expressed with Column objects.
# Each aggregate is aliased so the single-row result has readable headers.
(
    invoice_df
    .select(
        f.count("*").alias("Count *"),
        f.sum("Quantity").alias("TotalQuantity"),
        f.avg("UnitPrice").alias("AvgPrice"),
        # number of distinct invoice numbers
        f.countDistinct("InvoiceNo").alias("CountDistinct"),
    )
    .show()
)

+-------+-------------+-----------------+-------------+
|Count *|TotalQuantity|         AvgPrice|CountDistinct|
+-------+-------------+-----------------+-------------+
| 541909|      5176450|4.611113626088512|        25900|
+-------+-------------+-----------------+-------------+



In [4]:
# The same simple aggregations, written as SQL expression strings.
(
    invoice_df
    .selectExpr(
        # count(1) counts every row, including rows that contain nulls
        "count(1) as `count 1`",
        # count(<column>) skips rows where that column is null
        "count(StockCode) as `count field`",
        "sum(Quantity) as TotalQuantity",
        "avg(UnitPrice) as AvgPrice",
    )
    .show()
)

+-------+-----------+-------------+-----------------+
|count 1|count field|TotalQuantity|         AvgPrice|
+-------+-----------+-------------+-----------------+
| 541909|     541908|      5176450|4.611113626086851|
+-------+-----------+-------------+-----------------+



Now we want to summarise the records by Country and InvoiceNo, reporting the total quantity and the invoice value (invoice value is calculated as quantity multiplied by unit price).
The result is grouped by country and invoice number.

SELECT Country, InvoiceNo, sum(Quantity) AS TotalQuantity, round(sum(Quantity * UnitPrice), 2) AS InvoiceValue FROM table GROUP BY Country, InvoiceNo

This is a SQL query. To run it with Spark SQL, you first need to register the DataFrame as a temporary view (table).

In [5]:
# Register the DataFrame as a temporary view so Spark SQL can query it by name.
invoice_df.createOrReplaceTempView("sales")

# Per-country, per-invoice summary: total quantity and invoice value
# (sum of Quantity * UnitPrice, rounded to 2 decimal places).
invoice_summary_query = """
        SELECT Country, InvoiceNo,
            sum(Quantity) as TotalQuantity,
            round(sum(Quantity*UnitPrice),2) as InvoiceValue
        FROM sales
        GROUP BY Country, InvoiceNo"""
summary_sql = spark.sql(invoice_summary_query)

summary_sql.show()

+--------------+---------+-------------+------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|
+--------------+---------+-------------+------------+
|United Kingdom|   536446|          329|      440.89|
|United Kingdom|   536508|          216|      155.52|
|United Kingdom|   537018|           -3|         0.0|
|United Kingdom|   537401|          -24|         0.0|
|United Kingdom|   537811|           74|      268.86|
|United Kingdom|  C537824|           -2|       -14.9|
|United Kingdom|   538895|          370|      247.38|
|United Kingdom|   540453|          341|      302.45|
|United Kingdom|   541291|          217|      305.81|
|United Kingdom|   542551|           -1|         0.0|
|United Kingdom|   542576|           -1|         0.0|
|United Kingdom|   542628|            9|      132.35|
|United Kingdom|   542886|          199|      320.51|
|United Kingdom|   542907|           75|      313.85|
|United Kingdom|   543131|          134|       164.1|
|United Kingdom|   543189|  

In [6]:
# The same per-country, per-invoice summary using the DataFrame API.
# The invoice value is computed two ways for comparison: with Column
# functions (InvoiceValue) and with one SQL expression string (InvoiceValueExpr).
summary_df = (
    invoice_df
    .groupBy("Country", "InvoiceNo")
    .agg(
        f.sum("Quantity").alias("TotalQuantity"),
        f.round(f.sum(f.expr("Quantity * UnitPrice")), 2).alias("InvoiceValue"),
        f.expr("round(sum(Quantity * UnitPrice),2) as InvoiceValueExpr"),
    )
)

summary_df.show()

+--------------+---------+-------------+------------+----------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|InvoiceValueExpr|
+--------------+---------+-------------+------------+----------------+
|United Kingdom|   536446|          329|      440.89|          440.89|
|United Kingdom|   536508|          216|      155.52|          155.52|
|United Kingdom|   537018|           -3|         0.0|             0.0|
|United Kingdom|   537401|          -24|         0.0|             0.0|
|United Kingdom|   537811|           74|      268.86|          268.86|
|United Kingdom|  C537824|           -2|       -14.9|           -14.9|
|United Kingdom|   538895|          370|      247.38|          247.38|
|United Kingdom|   540453|          341|      302.45|          302.45|
|United Kingdom|   541291|          217|      305.81|          305.81|
|United Kingdom|   542551|           -1|         0.0|             0.0|
|United Kingdom|   542576|           -1|         0.0|             0.0|
|Unite

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()