In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy purchase log: each row is one purchase -> [customer, outlet, price, quantity].
X = [
    ['Customer1', 'a', 1200, 30],
    ['Customer1', 'b', 1200, 10],
    ['Customer2', 'a', 1225, 20],
    ['Customer1', 'a', 1800, 30],
    ['Customer3', 'b', 1900, 20],
    ['Customer2', 'c', 1600, 18],
    ['Customer3', 'c', 1400, 27],
    ['Customer2', 'c', 1300, 31],
    ['Customer2', 'c', 1140, 16],
]
df = pd.DataFrame(
    data=X,
    columns=['Customers', 'Outlet', 'Price', 'Quantity'],
)
# Preview the first five rows.
df.head()

Unnamed: 0,Customers,Outlet,Price,Quantity
0,Customer1,a,1200,30
1,Customer1,b,1200,10
2,Customer2,a,1225,20
3,Customer1,a,1800,30
4,Customer3,b,1900,20


<div class="alert alert-block alert-info">
<b>Groupby type1:</b> Here we group one column of a dataframe by a primary (key) column and apply an aggregate function such as mean, sum, etc.<br>
    Here we find the mean price of the goods bought by each customer
</div>

In [3]:
# Select 'Price' *before* aggregating: a frame-wide .agg('mean') would also try to
# average the string column 'Outlet', which raises TypeError on pandas >= 2.0.
# The double brackets keep the result a DataFrame, so no pd.DataFrame(...) wrapper is needed.
df.groupby('Customers')[['Price']].agg('mean')

Unnamed: 0_level_0,Price
Customers,Unnamed: 1_level_1
Customer1,1400.0
Customer2,1316.25
Customer3,1650.0


However, there is an alternative way to do the same thing, and it is the one I prefer to use in my own work.

In [4]:
# Dictionary form: {column name: aggregation to apply to that column}.
(
    df
    .groupby('Customers')
    .agg({'Price': 'mean'})
)

Unnamed: 0_level_0,Price
Customers,Unnamed: 1_level_1
Customer1,1400.0
Customer2,1316.25
Customer3,1650.0


It makes the work easier because a dictionary lets us apply a different aggregation to each column.

In [5]:
# A different aggregation for each column, keyed by column name.
(
    df
    .groupby('Customers')
    .agg({
        'Price': 'mean',     # average price per purchase row
        'Quantity': 'sum',   # total quantity bought
        'Outlet': 'count',   # number of non-null Outlet entries in the group
    })
)

Unnamed: 0_level_0,Price,Quantity,Outlet
Customers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Customer1,1400.0,70,3
Customer2,1316.25,85,4
Customer3,1650.0,47,2


<div class="alert alert-block alert-info">
<b>Groupby type2:</b> Group by using multiple indices <br>
    Here we group by both customer and outlet to find the average price of the goods each customer bought at each outlet. 
</div>

In [6]:
# Group on two keys, which produces a (Customers, Outlet) MultiIndex on the rows.
# 'Price' is selected before aggregating so that no other column is averaged and
# then thrown away; the double brackets keep the result a DataFrame.
df.groupby(['Customers', 'Outlet'])[['Price']].agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Customers,Outlet,Unnamed: 2_level_1
Customer1,a,1500.0
Customer1,b,1200.0
Customer2,a,1225.0
Customer2,c,1346.666667
Customer3,b,1900.0
Customer3,c,1400.0


<div class="alert alert-block alert-info">
<b>Groupby type3:</b> Group by using multiple functions <br>
    Here we perform groupby using two aggregates - mean and sum - on all the numeric columns of the dataframe. <br>
    The output contains both the average price (and quantity) and the total price (and quantity) for each customer
</div>

In [7]:
# Apply both aggregations to the numeric columns only. Without the explicit column
# selection, 'mean' is also attempted on the string column 'Outlet', which raises
# TypeError on pandas >= 2.0 (older versions silently dropped that column).
# The result is already a DataFrame with (column, aggregation) MultiIndex columns.
df.groupby('Customers')[['Price', 'Quantity']].agg(['mean', 'sum'])

Unnamed: 0_level_0,Price,Price,Quantity,Quantity
Unnamed: 0_level_1,mean,sum,mean,sum
Customers,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Customer1,1400.0,4200,23.333333,70
Customer2,1316.25,5265,21.25,85
Customer3,1650.0,3300,23.5,47


Again, we can do the above task in an alternative way by using the same dictionary format from Type1. This makes it easy to look back later and see which aggregations were performed on which columns.

In [8]:
# A dictionary value may be a single aggregation or a list of them.
(
    df
    .groupby('Customers')
    .agg({
        'Price': ['mean', 'sum'],
        'Quantity': 'sum',
        'Outlet': 'count',
    })
)

Unnamed: 0_level_0,Price,Price,Quantity,Outlet
Unnamed: 0_level_1,mean,sum,sum,count
Customers,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Customer1,1400.0,4200,70,3
Customer2,1316.25,5265,85,4
Customer3,1650.0,3300,47,2


<div class="alert alert-block alert-info">
<b>Groupby type3(a):</b> Group by using multiple functions and multiple indices<br>
    Here we perform groupby using two aggregates - mean and sum - on all the remaining columns of the dataframe. The indices will be both Customers and Outlet<br>
    The output contains both the average price (and quantity) and the total price (and quantity) for each customer across all outlets
</div>

In [9]:
# Two group keys and two aggregations: MultiIndex on the rows and on the columns.
# .agg already returns a DataFrame, so it is displayed directly.
(
    df
    .groupby(['Customers', 'Outlet'])
    .agg(['mean', 'sum'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Quantity,Quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,mean,sum
Customers,Outlet,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Customer1,a,1500.0,3000,30.0,60
Customer1,b,1200.0,1200,10.0,10
Customer2,a,1225.0,1225,20.0,20
Customer2,c,1346.666667,4040,21.666667,65
Customer3,b,1900.0,1900,20.0,20
Customer3,c,1400.0,1400,27.0,27


<div class="alert alert-block alert-info">
<b>Groupby type4:</b> We use GroupBy using user defined functions for aggregation. <br>
    We apply one inbuilt aggregation function on the column - 'Price' and one user defined function on the same column
</div>

In [10]:
def discounted_sum(pandas_series, discount_rate=0.10):
    '''
        Return the total of one group's values after a flat discount.

        When used inside groupby(...).agg(...), pandas calls this function once per
        group and passes that group's values for the aggregated column as a Series.
        For example, grouping by 'Customers' and aggregating 'Price', the call for
        Customer1 receives the 'Price' values of rows 0, 1 and 3.

        Parameters
        ----------
        pandas_series : pandas.Series
            Numeric values belonging to a single group.
        discount_rate : float, default 0.10
            Fraction taken off every value (0.10 means a 10% discount).

        Returns
        -------
        Sum of the discounted values.
    '''
    # Discount every value, then total them - vectorised, no Python-level loop.
    return (pandas_series * (1 - discount_rate)).sum()

# Built-in aggregations (strings) and user-defined ones (callables) can be mixed;
# the callable's name becomes the column label.
(
    df
    .groupby('Customers')
    .agg({'Price': ['sum', discounted_sum]})
)

Unnamed: 0_level_0,Price,Price
Unnamed: 0_level_1,sum,discounted_sum
Customers,Unnamed: 1_level_2,Unnamed: 2_level_2
Customer1,4200,3780.0
Customer2,5265,4738.5
Customer3,3300,2970.0


<div class="alert alert-block alert-info">
<b>Groupby type4(a):</b> We use GroupBy with user-defined functions for aggregation across different columns <br>
    We apply two built-in functions to two different columns, along with a few user-defined functions
</div>

In [11]:
def free_quantity(pandas_series, threshold=20, bonus=1):
    '''
        Return one group's total quantity including free bonus items.

        When used inside groupby(...).agg(...), pandas calls this function once per
        group and passes that group's values for the aggregated column as a Series.
        For example, grouping by 'Customers' and aggregating 'Quantity', the call for
        Customer1 receives the 'Quantity' values of rows 0, 1 and 3.

        Every purchase whose quantity is strictly greater than `threshold` earns
        `bonus` free item(s); all other purchases are counted unchanged.

        NOTE(review): the previous docstring said "at least 20", but the code has
        always used a strict `>` comparison, so a purchase of exactly 20 earns
        nothing. That behaviour is kept so the recorded outputs stay valid; switch
        to `>=` if the boundary is meant to qualify.

        Parameters
        ----------
        pandas_series : pandas.Series
            Quantities belonging to a single group.
        threshold : int, default 20
            A purchase must exceed this quantity to earn the bonus.
        bonus : int, default 1
            Number of free items added to each qualifying purchase.

        Returns
        -------
        Total quantity after the bonus items are added.
    '''
    # Count the purchases that qualify, then add the bonus once per such purchase.
    qualifying_purchases = (pandas_series > threshold).sum()
    return pandas_series.sum() + bonus * qualifying_purchases

# One built-in and one user-defined aggregation per column, for every
# (customer, outlet) pair.
(
    df
    .groupby(['Customers', 'Outlet'])
    .agg({
        'Price': ['sum', discounted_sum],
        'Quantity': ['sum', free_quantity],
    })
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Quantity,Quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,discounted_sum,sum,free_quantity
Customers,Outlet,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Customer1,a,3000,2700.0,60,62
Customer1,b,1200,1080.0,10,10
Customer2,a,1225,1102.5,20,20
Customer2,c,4040,3636.0,65,66
Customer3,b,1900,1710.0,20,20
Customer3,c,1400,1260.0,27,28


<div class="alert alert-block alert-info">
<b>Accessing elements after GroupBy:</b> We will reuse the table above to see how to access values in a table obtained from a groupby operation. <br>
The table clearly has 2 index levels, and we want to see the discounted price Customer1 paid in outlet 'b' 
</div>

In [12]:
# Store the grouped result in a variable so that individual cells can be looked up.
tdf = df.groupby(['Customers', 'Outlet']).agg({
    'Price': ['sum', discounted_sum],
    'Quantity': ['sum', free_quantity],
})

# Both axes carry a MultiIndex, so address the cell with a single .loc call:
# a (Customers, Outlet) tuple picks the row and a (column, aggregation) tuple picks
# the column. This avoids chained indexing (tdf.loc[...][...]), which builds an
# intermediate row Series and is ambiguous about which axis each label refers to.
print('The discounted price attained by Customer1 in outlet b will be:- ',
      tdf.loc[('Customer1', 'b'), ('Price', 'discounted_sum')])

The discounted price attained by Customer1 in outlet b will be:-  1080.0
