
# groupby: split-apply-combine strategy

In [None]:
# To demonstrate how groupby works internally, let's take an example.
# Assume we have a pandas DataFrame named df with seven rows and two columns,
# holding the gender and the weight of each person living in a flat, as shown here.

import pandas as pd

dic = dict(
    gender=['f', 'm', 'm', 'f', 'm', 'f', 'm'],
    weight=[58, 60, 59, 55, 65, 52, 61],
)
df = pd.DataFrame(data=dic)
df

Unnamed: 0,gender,weight
0,f,58
1,m,60
2,m,59
3,f,55
4,m,65
5,f,52
6,m,61


In [None]:
# Split step: partition the data by gender.
# For each gender we build a boolean mask: f_filter is True on the rows whose
# gender is 'f', and similarly m_filter is True on the rows whose gender is 'm'.
# Indexing df with a mask keeps only the matching rows, so running this cell
# splits the DataFrame into two groups and prints each of them.

f_filter = (df['gender'] == 'f')
m_filter = (df['gender'] == 'm')

print(df.loc[f_filter])
print(df.loc[m_filter])

  gender  weight
0      f      58
3      f      55
5      f      52
  gender  weight
1      m      60
2      m      59
4      m      65
6      m      61


In [None]:
# Apply step: after the split, aggregate each group with the mean.
# The mean weight of the female group is stored in f_avg and, similarly,
# the mean weight of the male group is stored in m_avg.
# Running this cell prints the average weight of each gender group.

f_avg = df.loc[f_filter, 'weight'].mean()
m_avg = df.loc[m_filter, 'weight'].mean()

print(f_avg, m_avg)


55.0 61.25


In [None]:
# Combine step: gather the results of both groups into a single DataFrame.
# We build a dictionary with the keys 'Gender' and 'weight'; its values are the
# group labels and the average weights stored in f_avg and m_avg.
# Running this line creates the DataFrame that is our final result: it
# contains the mean weight for each gender.

pd.DataFrame(
    data={
        'Gender': ['f', 'm'],
        'weight': [f_avg, m_avg],
    }
)


Unnamed: 0,Gender,weight
0,f,55.0
1,m,61.25


In [None]:
# The previous cells showed what groupby does internally. The same result can
# be obtained directly with the following steps.
# First, we call the groupby method with the string 'gender' as its argument.
# Next, we select the 'weight' column with [['weight']] so that only the column
# we want to summarise is aggregated. Without this selection every remaining
# column is aggregated, and a non-numeric column (such as the 'location' column
# that df gains later in this notebook) can make the mean fail.
# After that, we call the agg method with a list of statistical operations;
# in this case we want the mean weight for each gender at once.
# Running this line results in a DataFrame containing the mean weight for each gender.

df.groupby('gender')[['weight']].agg(['mean'])

Unnamed: 0_level_0,weight
Unnamed: 0_level_1,mean
gender,Unnamed: 1_level_2
f,55.0
m,61.25


In [None]:
# Similarly, to compute several statistics at once (the minimum, maximum, mean
# and sum of the weight for each gender), we pass a list of those operations to
# the agg method, as shown here.
# The 'weight' column is selected explicitly with [['weight']] so that only it
# is aggregated; a non-numeric column (such as the 'location' column that df
# gains later in this notebook) would otherwise be aggregated too, and
# operations like mean are not meaningful for it.
# Running this cell results in a DataFrame showing the minimum, maximum, mean
# and sum of the weight for each gender group.

df.groupby('gender')[['weight']].agg(['min', 'max', 'mean', 'sum'])


Unnamed: 0_level_0,weight,weight,weight,weight
Unnamed: 0_level_1,min,max,mean,sum
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
f,52,58,55.0,165
m,59,65,61.25,245


# multi-column groupby: aggregation

In [None]:
# To demonstrate multi-column groupby aggregation, assume we have the following
# DataFrame named df, containing 3 columns and 7 rows, as shown here.

import pandas as pd

dic = dict(
    gender=['f', 'm', 'm', 'f', 'm', 'f', 'm'],
    weight=[58, 60, 59, 55, 65, 52, 61],
    location=['LA', 'LA', 'NY', 'NY', 'LA', 'NY', 'NY'],
)
df = pd.DataFrame(data=dic)
df

Unnamed: 0,gender,weight,location
0,f,58,LA
1,m,60,LA
2,m,59,NY
3,f,55,NY
4,m,65,LA
5,f,52,NY
6,m,61,NY


In [None]:
# To perform a multi-column groupby aggregation, we go through the following steps.
# First, we group this DataFrame by several columns at once; in this case we
# pass a list containing the columns 'gender' and 'location'.
# After that, we apply the agg method; in this case we aggregate with the mean.
# Running this cell results in a DataFrame that has been grouped by gender and
# location and aggregated with the mean.

grouped = df.groupby(by=['gender', 'location'])
grouped.agg(['mean'])


Unnamed: 0_level_0,Unnamed: 1_level_0,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
gender,location,Unnamed: 2_level_2
f,LA,58.0
f,NY,53.5
m,LA,62.5
m,NY,60.0
