**Installation and importing necessary libraries**

In [None]:
pip install pyspark

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('group_and_agg')
    .getOrCreate()
)

In [16]:
# Load the sales CSV with pandas first for a quick look;
# the trailing expression renders the first five rows.
data = pd.read_csv(
    '/content/drive/MyDrive/Spark_DataFrames/sales_info.csv'
)
data.head()

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,GOOG,Frank,340
3,MSFT,Tina,600
4,MSFT,Amy,124


In [14]:
# Column dtypes of the pandas DataFrame.
data.dtypes


Company    object
Person     object
Sales       int64
dtype: object

**Using spark**

In [5]:
# Read the same CSV into a Spark DataFrame: the first row supplies the column
# names (header=True) and Spark infers the column types (inferSchema=True).
df = spark.read.csv(
    '/content/drive/MyDrive/Spark_DataFrames/sales_info.csv',
    inferSchema=True,
    header=True,
)

In [6]:
# Print the column names, types and nullability of the Spark DataFrame.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [7]:
# Print the rows of the Spark DataFrame as a text table.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



# **Group By and Aggregate**

**Let's group by company**

In [8]:
# groupBy on its own computes nothing yet; the expression evaluates to a GroupedData object.
df.groupBy('Company')

<pyspark.sql.group.GroupedData at 0x7fae11ebf610>

This returns a GroupedData object, on which you can call various aggregate methods.

In [24]:
# Sanity check: the data should contain 4 distinct companies.
(
    df.select('Company')
    .distinct()
    .count()
)

4

In [25]:
# Number of rows per company.
(
    df.groupBy('Company')
    .count()
    .show()
)

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [26]:
# Average per company; with no column arguments this yields avg(Sales) here.
(
    df.groupBy('Company')
    .mean()
    .show()
)

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [27]:
# Largest Sales value per company.
(
    df.groupBy('Company')
    .max()
    .show()
)

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [28]:
# Smallest Sales value per company.
(
    df.groupBy('Company')
    .min()
    .show()
)

+-------+----------+
|Company|min(Sales)|
+-------+----------+
|   APPL|     130.0|
|   GOOG|     120.0|
|     FB|     350.0|
|   MSFT|     124.0|
+-------+----------+



In [9]:
# Total Sales per company.
(
    df.groupBy('Company')
    .sum()
    .show()
)

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+



Not every aggregation needs a groupBy call. Instead, you can call the general .agg() method directly on the DataFrame, which applies the aggregate across all rows of the specified column. It can take a single column-to-aggregate pair, or request multiple aggregations at once using dictionary notation.

In [11]:
# Maximum Sales over the whole DataFrame (no grouping), requested through
# the {column: aggregate} dictionary form of agg().
(
    df
    .agg({'Sales': 'max'})
    .show()
)

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [13]:
# The same dictionary-style aggregation also works on a GroupedData object.
# NOTE(review): 'company' differs in case from the 'Company' column. It resolves
# here and the result header echoes the lowercase spelling -- presumably relying
# on Spark's default case-insensitive column resolution; confirm before reuse.
grouped = df.groupBy('company')
(
    grouped
    .agg({'Sales': 'max'})
    .show()
)

+-------+----------+
|company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



# **Functions**

In [21]:
from pyspark.sql.functions import countDistinct, avg, stddev, format_number

In [15]:
# Count how many distinct values the Company column holds.
df.select(
    countDistinct('Company')
).show()

+-----------------------+
|count(DISTINCT Company)|
+-----------------------+
|                      4|
+-----------------------+



In [17]:
# alias() replaces the generated column header with a readable name.
df.select(
    countDistinct('Company').alias('Distinct Companies')
).show()

+------------------+
|Distinct Companies|
+------------------+
|                 4|
+------------------+



In [19]:
# Average of Sales over all rows, shown under the name Average_Sales.
df.select(
    avg('Sales').alias('Average_Sales')
).show()

+-----------------+
|    Average_Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [20]:
# Standard deviation of Sales over all rows, shown under the name Std_deviation.
df.select(
    stddev('Sales').alias('Std_deviation')
).show()

+------------------+
|     Std_deviation|
+------------------+
|250.08742410799007|
+------------------+



**Formatting numbers**

In [24]:
# Keep the one-row standard-deviation result, then display it formatted
# to two decimal places with format_number.
sales_dev = df.select(
    stddev('Sales').alias('Std_deviation')
)
sales_dev.select(
    format_number('Std_deviation', 2)
).show()

+-------------------------------+
|format_number(Std_deviation, 2)|
+-------------------------------+
|                         250.09|
+-------------------------------+



# **Order By**

In [29]:
# Sort the rows by Sales; passing just the column name gives smallest-first order.
(
    df.orderBy('Sales')
    .show()
)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [30]:
# Sort the rows by Sales from largest to smallest via the column's desc() ordering.
df.orderBy(
    df['Sales'].desc()
).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+

