In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain a Spark session for this notebook, registered under the app name 'aggs'.
spark = (
    SparkSession.builder
    .appName('aggs')
    .getOrCreate()
)

In [4]:
# Load the sales CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'sales_info.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded rows to confirm the CSV was parsed with its header.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [6]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [8]:
# Grouping on its own only yields a GroupedData handle; an aggregation has to
# be applied to it to get a result. The key is spelled 'Company', exactly as in
# the schema, so the lookup does not depend on Spark's case-insensitive column
# resolution (spark.sql.caseSensitive=false by default).
df.groupBy('Company')

<pyspark.sql.group.GroupedData at 0x7fe8602daeb0>

In [9]:
# Average of the numeric columns (here only Sales) per company.
# The key is spelled 'Company', exactly as in the schema, so the cell does not
# rely on Spark's case-insensitive column resolution; the result column is
# therefore labelled 'Company' as well.
# GroupedData offers the same kind of shortcut for max(), min() and count().
df.groupBy('Company').mean().show()

+-------+-----------------+
|company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [12]:
 # agg() can be called directly on the DataFrame when no grouping is needed:
 # the aggregate (count, max, mean, sum, ...) is then computed over all rows.
 df.agg({'Sales': 'sum'}).show()

+----------+
|sum(Sales)|
+----------+
|    4327.0|
+----------+



In [13]:
# groupBy() and agg() can be combined: the aggregation is evaluated per group.
# The key is spelled 'Company', exactly as in the schema, so the lookup does
# not depend on Spark's case-insensitive column resolution; the result column
# is therefore labelled 'Company' as well.
grouped_data = df.groupBy('Company')

# Largest single Sales value within each company.
grouped_data.agg({'Sales': 'max'}).show()

+-------+----------+
|company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [14]:
from pyspark.sql.functions import ( countDistinct , avg , stddev)

In [17]:
# Functions imported from pyspark.sql.functions produce column expressions
# that are handed to select(); alias() sets the label of each result column.
df.select(
    countDistinct('Sales').alias('count sales'),
    countDistinct('Company').alias('count company'),
).show()

+-----------+-------------+
|count sales|count company|
+-----------+-------------+
|         11|            4|
+-----------+-------------+



In [18]:
# Mean of Sales across the whole DataFrame (no grouping).
(
    df
    .select(avg('Sales'))
    .show()
)

+-----------------+
|       avg(Sales)|
+-----------------+
|360.5833333333333|
+-----------------+



In [23]:
# Per-company average sale and row count computed in one aggregation.
(
    df.groupBy('Company')
    .agg({'Sales': 'avg', 'Company': 'count'})
    .show()
)

+-------+--------------+-----------------+
|Company|count(Company)|       avg(Sales)|
+-------+--------------+-----------------+
|   APPL|             4|            370.0|
|   GOOG|             3|            220.0|
|     FB|             2|            610.0|
|   MSFT|             3|322.3333333333333|
+-------+--------------+-----------------+



In [24]:
# Standard deviation of Sales over all rows; Spark labels the result
# column stddev_samp(Sales).
(
    df
    .select(stddev('Sales'))
    .show()
)

+------------------+
|stddev_samp(Sales)|
+------------------+
|250.08742410799007|
+------------------+



In [20]:
# alias() replaces the generated column label with a chosen one.
(
    df
    .select(stddev('Sales').alias('SD sales'))
    .show()
)

+------------------+
|          SD sales|
+------------------+
|250.08742410799007|
+------------------+



In [25]:
# using format number 

from pyspark.sql.functions import format_number

In [26]:
# Step 1: standard deviation of Sales as a one-row DataFrame with column 'STD'.
sales_std = df.select(
    stddev('Sales').alias('STD'),
)

# Step 2: display that value formatted to two decimal places, keeping the
# 'STD' label.
sales_std.select(
    format_number('STD', 2).alias('STD'),
).show()

+------+
|   STD|
+------+
|250.09|
+------+



In [27]:
# Ordering and sorting.
# Ascending by Sales -- the default direction, spelled out here for clarity.
df.orderBy('Sales', ascending=True).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [30]:
# Descending by Sales: largest sale first.
df.orderBy('Sales', ascending=False).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+

