## 03-GroupBy_and_Aggregate_Functions

In [0]:
# 03-GroupBy_and_Aggregate_Functions
# Create the Spark session used by every cell below (or reuse one that
# already exists under this application name).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PySparkExamples")
    .getOrCreate()
)

In [0]:
# Load the sales CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# Note: the inferSchema option is used with CSV sources, not with JSON files.
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('dbfs:/FileStore/tables/sales_info.csv')
)

# Quick inspection: schema, column names, column/row counts, first rows.
df.printSchema()
print(
    "DataFrame columns are:", df.columns,
    "with column count:", len(df.columns),
    "and with row count:", df.count(),
)
print(df.head(2))
df.show()

root
 |-- Company: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- Sales: integer (nullable = true)

DataFrame columns are: ['Company', 'Branch', 'Person', 'Qty', 'Sales'] with column count: 5 and with row count: 12
[Row(Company='GOOG', Branch='NY', Person='Sam', Qty=1000, Sales=200), Row(Company='GOOG', Branch='NY', Person='Charlie', Qty=4000, Sales=120)]
+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|   GOOG|    NY|    Sam|1000|  200|
|   GOOG|    NY|Charlie|4000|  120|
|   GOOG|    CA|  Frank|6000|  340|
|   MSFT|    CA|   Tina|7000|  600|
|   MSFT|    NY|    Amy|2000|  124|
|   MSFT|    CA|Vanessa|2500|  243|
|     FB|    CA|   Carl|9000|  870|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|   John|7000|  250|
|   APPL|    NY|  Linda|9500|  130|
|   APPL|    NY|   Mike|1000|  750|
|   APPL|    CA|  Chris|7600|  350|
+------

In [0]:
# Group the rows by company. groupBy() on its own only yields a GroupedData
# handle (printed below); an aggregate must be applied to get a DataFrame back.
company_groups = df.groupBy("Company")
print(company_groups)

company_groups.avg().show()                  # average of every numeric column
company_groups.count().show()                # number of rows per company
company_groups.agg({"Sales": "sum"}).show()  # dict form: {column: aggregate}

<pyspark.sql.group.GroupedData object at 0x7f970da090a0>
+-------+------------------+-----------------+
|Company|          avg(Qty)|       avg(Sales)|
+-------+------------------+-----------------+
|   APPL|            6275.0|            370.0|
|   GOOG|3666.6666666666665|            220.0|
|     FB|            6025.0|            610.0|
|   MSFT|3833.3333333333335|322.3333333333333|
+-------+------------------+-----------------+

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|      1480|
|   GOOG|       660|
|     FB|      1220|
|   MSFT|       967|
+-------+----------+



In [0]:
# Per-company totals; the result columns appear in the order the column
# names are passed to sum().
(
    df.groupBy("Company")
    .sum("Sales", "Qty")
    .show()
)

# Grouping on two keys yields one row per (Company, Branch) combination.
(
    df.groupBy("Company", "Branch")
    .sum("Qty", "Sales")
    .show()
)

+-------+----------+--------+
|Company|sum(Sales)|sum(Qty)|
+-------+----------+--------+
|   APPL|      1480|   25100|
|   GOOG|       660|   11000|
|     FB|      1220|   12050|
|   MSFT|       967|   11500|
+-------+----------+--------+

+-------+------+--------+----------+
|Company|Branch|sum(Qty)|sum(Sales)|
+-------+------+--------+----------+
|   GOOG|    NY|    5000|       320|
|   GOOG|    CA|    6000|       340|
|     FB|    NY|    3050|       350|
|   APPL|    CA|   14600|       600|
|   MSFT|    CA|    9500|       843|
|     FB|    CA|    9000|       870|
|   MSFT|    NY|    2000|       124|
|   APPL|    NY|   10500|       880|
+-------+------+--------+----------+



In [0]:
# NOTE: importing sum and max by name shadows the Python builtins of the
# same name for every cell that runs after this one.
from pyspark.sql.functions import sum, avg, max

# Two equivalent ways to total Sales per company:
# 1) Column expression -- the result column can be renamed with alias().
(
    df.groupBy("Company")
    .agg(sum("Sales").alias("Sum_Sales"))
    .show()
)
# 2) Dict shorthand -- the result column keeps the generated name sum(Sales).
(
    df.groupBy("Company")
    .agg({"Sales": "sum"})
    .show()
)

+-------+---------+
|Company|Sum_Sales|
+-------+---------+
|   APPL|     1480|
|   GOOG|      660|
|     FB|     1220|
|   MSFT|      967|
+-------+---------+

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|      1480|
|   GOOG|       660|
|     FB|      1220|
|   MSFT|       967|
+-------+----------+



In [0]:
from pyspark.sql.functions import sum, avg, max, min, mean, count

# Several aggregates per company in a single agg() call, each given a
# readable alias. The enclosing parentheses make line continuations implicit.
(
    df.groupBy("Company")
    .agg(
        sum("Sales").alias("Sum_Sales"),
        avg("Sales").alias("Average_Sales"),
        sum("Qty").alias("Sum_Quantity"),
        max("Qty").alias("Max_Quantity"),
        min("Qty").alias("Min_Quantity"),
        count("Qty").alias("Count_Quantit"),
    )
    .show(truncate=False)
)

+-------+---------+-----------------+------------+------------+------------+-------------+
|Company|Sum_Sales|Average_Sales    |Sum_Quantity|Max_Quantity|Min_Quantity|Count_Quantit|
+-------+---------+-----------------+------------+------------+------------+-------------+
|APPL   |1480     |370.0            |25100       |9500        |1000        |4            |
|GOOG   |660      |220.0            |11000       |6000        |1000        |3            |
|FB     |1220     |610.0            |12050       |9000        |3050        |2            |
|MSFT   |967      |322.3333333333333|11500       |7000        |2000        |3            |
+-------+---------+-----------------+------------+------------+------------+-------------+



In [0]:
from pyspark.sql.functions import sum, avg, max, min, mean, count, col

# Aliased aggregate expressions to compute for each company.
per_company_aggs = [
    sum("Sales").alias("Sum_Sales"),
    avg("Sales").alias("Average_Sales"),
    sum("Qty").alias("Sum_Quantity"),
    max("Qty").alias("Max_Quantity"),
    min("Qty").alias("Min_Quantity"),
    count("Qty").alias("Count_Quantit"),
]

(
    df.groupBy("Company")
    .agg(*per_company_aggs)
    # Filter on the aggregated column: keep companies whose total
    # quantity is at least 12000.
    .filter(col("Sum_Quantity") >= 12000)
    .show(truncate=False)
)

+-------+---------+-------------+------------+------------+------------+-------------+
|Company|Sum_Sales|Average_Sales|Sum_Quantity|Max_Quantity|Min_Quantity|Count_Quantit|
+-------+---------+-------------+------------+------------+------------+-------------+
|APPL   |1480     |370.0        |25100       |9500        |1000        |4            |
|FB     |1220     |610.0        |12050       |9000        |3050        |2            |
+-------+---------+-------------+------------+------------+------------+-------------+



In [0]:
# Ranking
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Rank the rows inside each company by Sales, highest first
# (rank 1 = largest Sales value in that company).
windowSpec = Window.partitionBy("Company").orderBy(F.col("Sales").desc())
df.withColumn("rank", F.rank().over(windowSpec)).show()

# Same partitioning, but ranked from the lowest Sales value upwards.
windowSpec = Window.partitionBy("Company").orderBy(F.col("Sales").asc())
df.withColumn("rank", F.rank().over(windowSpec)).show()

+-------+------+-------+----+-----+----+
|Company|Branch| Person| Qty|Sales|rank|
+-------+------+-------+----+-----+----+
|   APPL|    NY|   Mike|1000|  750|   1|
|   APPL|    CA|  Chris|7600|  350|   2|
|   APPL|    CA|   John|7000|  250|   3|
|   APPL|    NY|  Linda|9500|  130|   4|
|     FB|    CA|   Carl|9000|  870|   1|
|     FB|    NY|  Sarah|3050|  350|   2|
|   GOOG|    CA|  Frank|6000|  340|   1|
|   GOOG|    NY|    Sam|1000|  200|   2|
|   GOOG|    NY|Charlie|4000|  120|   3|
|   MSFT|    CA|   Tina|7000|  600|   1|
|   MSFT|    CA|Vanessa|2500|  243|   2|
|   MSFT|    NY|    Amy|2000|  124|   3|
+-------+------+-------+----+-----+----+

+-------+------+-------+----+-----+----+
|Company|Branch| Person| Qty|Sales|rank|
+-------+------+-------+----+-----+----+
|   APPL|    NY|  Linda|9500|  130|   1|
|   APPL|    CA|   John|7000|  250|   2|
|   APPL|    CA|  Chris|7600|  350|   3|
|   APPL|    NY|   Mike|1000|  750|   4|
|     FB|    NY|  Sarah|3050|  350|   1|
|     FB|    CA

## Functions
`pyspark.sql.functions` provides a wide variety of functions you can import. Check the documentation for the full list:
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions

In [0]:
# Other functions
import pyspark.sql.functions as F
from pyspark.sql.functions import countDistinct, avg, stddev

# Standard deviation of Sales across the whole DataFrame, as a one-row,
# one-column result named "std".
sales_std = df.select(stddev("Sales").alias("std"))
sales_std.show()

# format_number("col_name", decimal places)
std_2dp = F.format_number("std", 2)
sales_std.select(std_2dp).show()                            # generated column name
sales_std.select(std_2dp.alias("formatted_stddev")).show()  # explicit column name

+------------------+
|               std|
+------------------+
|250.08742410799007|
+------------------+

+---------------------+
|format_number(std, 2)|
+---------------------+
|               250.09|
+---------------------+

+----------------+
|formatted_stddev|
+----------------+
|          250.09|
+----------------+



In [0]:
# Count of distinct Sales values, referenced once through a Column and once by
# name; the first result keeps Spark's generated column name, the second is aliased.
df.select(
    countDistinct(df["Sales"]),
    countDistinct("Sales").alias("Count_Distinct_Sales"),
).show()

# Standard deviation and average of Sales, plus the average formatted to
# two decimal places for display.
df.select(
    stddev("Sales"),
    avg("Sales"),
    F.format_number(avg("Sales"), 2).alias("Average_Sales"),
).show()

# An alias may contain spaces.
df.select(
    countDistinct("Sales").alias("Distinct Sales"),
).show()

+---------------------+--------------------+
|count(DISTINCT Sales)|Count_Distinct_Sales|
+---------------------+--------------------+
|                   11|                  11|
+---------------------+--------------------+

+------------------+-----------------+-------------+
|stddev_samp(Sales)|       avg(Sales)|Average_Sales|
+------------------+-----------------+-------------+
|250.08742410799007|360.5833333333333|       360.58|
+------------------+-----------------+-------------+

+--------------+
|Distinct Sales|
+--------------+
|            11|
+--------------+



In [0]:
# OrderBy
# Ascending order is the default. The three calls below are equivalent ways
# of sorting by Sales: the key may be a Column or a column name, and sort()
# can be used in place of orderBy().
df.orderBy(df["Sales"]).show()
df.orderBy("Sales").show()
# Refer to the column by its exact name ("Sales"): a lowercase "sales" only
# resolves while spark.sql.caseSensitive is left at its default of false.
df.sort("Sales").show()

+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|   GOOG|    NY|Charlie|4000|  120|
|   MSFT|    NY|    Amy|2000|  124|
|   APPL|    NY|  Linda|9500|  130|
|   GOOG|    NY|    Sam|1000|  200|
|   MSFT|    CA|Vanessa|2500|  243|
|   APPL|    CA|   John|7000|  250|
|   GOOG|    CA|  Frank|6000|  340|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|  Chris|7600|  350|
|   MSFT|    CA|   Tina|7000|  600|
|   APPL|    NY|   Mike|1000|  750|
|     FB|    CA|   Carl|9000|  870|
+-------+------+-------+----+-----+

+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|   GOOG|    NY|Charlie|4000|  120|
|   MSFT|    NY|    Amy|2000|  124|
|   APPL|    NY|  Linda|9500|  130|
|   GOOG|    NY|    Sam|1000|  200|
|   MSFT|    CA|Vanessa|2500|  243|
|   APPL|    CA|   John|7000|  250|
|   GOOG|    CA|  Frank|6000|  340|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|  Chris|760

In [0]:
# Descending order of Sales
import pyspark.sql.functions as F

# Build the descending sort key from a column expression.
df.orderBy(F.col("Sales").desc()).show()

+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|     FB|    CA|   Carl|9000|  870|
|   APPL|    NY|   Mike|1000|  750|
|   MSFT|    CA|   Tina|7000|  600|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|  Chris|7600|  350|
|   GOOG|    CA|  Frank|6000|  340|
|   APPL|    CA|   John|7000|  250|
|   MSFT|    CA|Vanessa|2500|  243|
|   GOOG|    NY|    Sam|1000|  200|
|   APPL|    NY|  Linda|9500|  130|
|   MSFT|    NY|    Amy|2000|  124|
|   GOOG|    NY|Charlie|4000|  120|
+-------+------+-------+----+-----+



In [0]:
# Sort by Sales, highest first. The column is referenced by its exact name
# ("Sales") so the lookup does not depend on case-insensitive resolution.
df.sort(F.desc("Sales")).show()

+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|     FB|    CA|   Carl|9000|  870|
|   APPL|    NY|   Mike|1000|  750|
|   MSFT|    CA|   Tina|7000|  600|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|  Chris|7600|  350|
|   GOOG|    CA|  Frank|6000|  340|
|   APPL|    CA|   John|7000|  250|
|   MSFT|    CA|Vanessa|2500|  243|
|   GOOG|    NY|    Sam|1000|  200|
|   APPL|    NY|  Linda|9500|  130|
|   MSFT|    NY|    Amy|2000|  124|
|   GOOG|    NY|Charlie|4000|  120|
+-------+------+-------+----+-----+

